author     Amir Ayupov <aaupov@fb.com>   2024-07-31 22:13:40 -0700
committer  Amir Ayupov <aaupov@fb.com>   2024-07-31 22:13:40 -0700
commit     16a22bc81f9200e016296237ca6640fd9c0c3178 (patch)
tree       f79f44ef275aa946390a055fb913899c5288149e
parent     e1ae4a428056fc77ceedf4a6d354c9fe52b8a79a (diff)
parent     fb97b4f96217442c684a940558135ffbfe45b756 (diff)
[spr] changes introduced through rebase (users/aaupov/spr/main.boltnfc-print-timers-in-perf2bolt-invocation)
Created using spr 1.3.4 [skip ci]
-rw-r--r--.github/workflows/release-asset-audit.py51
-rw-r--r--.github/workflows/release-asset-audit.yml54
-rw-r--r--.github/workflows/release-binaries-all.yml94
-rw-r--r--.github/workflows/release-binaries-save-stage/action.yml38
-rw-r--r--.github/workflows/release-binaries-setup-stage/action.yml59
-rw-r--r--.github/workflows/release-binaries.yml488
-rw-r--r--.github/workflows/release-sources.yml2
-rw-r--r--.github/workflows/release-tasks.yml10
-rw-r--r--bolt/include/bolt/Core/DIEBuilder.h17
-rw-r--r--bolt/include/bolt/Core/DebugData.h5
-rw-r--r--bolt/include/bolt/Core/GDBIndex.h8
-rw-r--r--bolt/include/bolt/Rewrite/DWARFRewriter.h37
-rw-r--r--bolt/lib/Core/DIEBuilder.cpp64
-rw-r--r--bolt/lib/Core/DebugData.cpp8
-rw-r--r--bolt/lib/Core/GDBIndex.cpp3
-rw-r--r--bolt/lib/Rewrite/CMakeLists.txt1
-rw-r--r--bolt/lib/Rewrite/DWARFRewriter.cpp300
-rw-r--r--bolt/test/AArch64/dummy-return.s4
-rw-r--r--bolt/test/X86/debug-fission-single-convert.s13
-rw-r--r--bolt/test/X86/debug-fission-single.s12
-rw-r--r--bolt/test/X86/dwarf4-ftypes-dwo-input-dwp-output.test30
-rw-r--r--bolt/test/X86/dwarf4-ftypes-dwo-mono-input-dwp-output.test45
-rw-r--r--bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test8
-rw-r--r--bolt/test/X86/dwarf5-df-larger-batch-size.test28
-rw-r--r--bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test53
-rw-r--r--bolt/test/X86/dwarf5-df-types-modify-dwo-name.test25
-rw-r--r--bolt/test/X86/dwarf5-ftypes-dwo-mono-input-dwp-output.test55
-rwxr-xr-xclang-tools-extra/clang-tidy/add_new_check.py118
-rwxr-xr-xclang-tools-extra/test/clang-tidy/check_clang_tidy.py50
-rw-r--r--clang/bindings/python/clang/cindex.py440
-rw-r--r--clang/bindings/python/tests/cindex/test_code_completion.py22
-rw-r--r--clang/bindings/python/tests/cindex/test_comment.py4
-rw-r--r--clang/cmake/caches/Release.cmake6
-rw-r--r--clang/cmake/modules/AddClang.cmake2
-rw-r--r--clang/docs/MemorySanitizer.rst9
-rw-r--r--clang/docs/ReleaseNotes.rst21
-rw-r--r--clang/docs/analyzer/checkers.rst12
-rw-r--r--clang/include/clang/AST/ASTContext.h2
-rw-r--r--clang/include/clang/AST/ASTImporter.h1
-rw-r--r--clang/include/clang/AST/ASTNodeTraverser.h4
-rw-r--r--clang/include/clang/AST/DeclCXX.h4
-rw-r--r--clang/include/clang/AST/DeclOpenMP.h8
-rw-r--r--clang/include/clang/AST/OpenMPClause.h4
-rw-r--r--clang/include/clang/AST/RecursiveASTVisitor.h10
-rw-r--r--clang/include/clang/AST/Type.h16
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h27
-rw-r--r--clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h18
-rw-r--r--clang/include/clang/Basic/Attr.td20
-rw-r--r--clang/include/clang/Basic/AttrDocs.td40
-rw-r--r--clang/include/clang/Basic/AttributeCommonInfo.h6
-rw-r--r--clang/include/clang/Basic/DiagnosticFrontendKinds.td3
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td7
-rw-r--r--clang/include/clang/Basic/DiagnosticSerializationKinds.td16
-rw-r--r--clang/include/clang/Driver/Options.td6
-rw-r--r--clang/include/clang/Lex/Preprocessor.h26
-rw-r--r--clang/include/clang/Parse/Parser.h15
-rw-r--r--clang/include/clang/Sema/Overload.h2
-rw-r--r--clang/lib/AST/ASTContext.cpp4
-rw-r--r--clang/lib/AST/ASTImporter.cpp87
-rw-r--r--clang/lib/AST/ExprCXX.cpp2
-rw-r--r--clang/lib/AST/Interp/Compiler.cpp1
-rw-r--r--clang/lib/AST/Interp/Context.cpp3
-rw-r--r--clang/lib/AST/Interp/Descriptor.cpp3
-rw-r--r--clang/lib/AST/Interp/EvaluationResult.cpp19
-rw-r--r--clang/lib/AST/Interp/Interp.h44
-rw-r--r--clang/lib/AST/ItaniumMangle.cpp2
-rw-r--r--clang/lib/AST/MicrosoftMangle.cpp1
-rw-r--r--clang/lib/AST/StmtProfile.cpp4
-rw-r--r--clang/lib/AST/Type.cpp2
-rw-r--r--clang/lib/ASTMatchers/ASTMatchersInternal.cpp23
-rw-r--r--clang/lib/Analysis/Consumed.cpp10
-rw-r--r--clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp16
-rw-r--r--clang/lib/Analysis/FlowSensitive/Transfer.cpp11
-rw-r--r--clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp9
-rw-r--r--clang/lib/Analysis/LiveVariables.cpp38
-rw-r--r--clang/lib/Analysis/ThreadSafetyCommon.cpp2
-rw-r--r--clang/lib/Basic/Attributes.cpp34
-rw-r--r--clang/lib/CodeGen/CGBuiltin.cpp24
-rw-r--r--clang/lib/CodeGen/CGCall.cpp8
-rw-r--r--clang/lib/CodeGen/CGDebugInfo.cpp53
-rw-r--r--clang/lib/CodeGen/CGDecl.cpp2
-rw-r--r--clang/lib/CodeGen/CGExprComplex.cpp1
-rw-r--r--clang/lib/CodeGen/CGGPUBuiltin.cpp29
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntime.cpp18
-rw-r--r--clang/lib/CodeGen/CGStmt.cpp38
-rw-r--r--clang/lib/CodeGen/CGStmtOpenMP.cpp38
-rw-r--r--clang/lib/CodeGen/CodeGenFunction.cpp3
-rw-r--r--clang/lib/CodeGen/CodeGenFunction.h4
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp4
-rw-r--r--clang/lib/CodeGen/Targets/AArch64.cpp6
-rw-r--r--clang/lib/Driver/ToolChains/AMDGPU.cpp24
-rw-r--r--clang/lib/Driver/ToolChains/Arch/X86.cpp28
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp4
-rw-r--r--clang/lib/Driver/ToolChains/CommonArgs.cpp4
-rw-r--r--clang/lib/Driver/ToolChains/Cuda.cpp8
-rw-r--r--clang/lib/Driver/ToolChains/Flang.cpp3
-rw-r--r--clang/lib/Driver/ToolChains/PS4CPU.cpp4
-rw-r--r--clang/lib/Format/TokenAnnotator.cpp42
-rw-r--r--clang/lib/Lex/PPCaching.cpp48
-rw-r--r--clang/lib/Lex/PPMacroExpansion.cpp10
-rw-r--r--clang/lib/Lex/Preprocessor.cpp2
-rw-r--r--clang/lib/Parse/ParseCXXInlineMethods.cpp41
-rw-r--r--clang/lib/Parse/ParseDecl.cpp196
-rw-r--r--clang/lib/Parse/ParseExprCXX.cpp19
-rw-r--r--clang/lib/Sema/CheckExprLifetime.cpp1
-rw-r--r--clang/lib/Sema/SemaAPINotes.cpp15
-rw-r--r--clang/lib/Sema/SemaDecl.cpp17
-rw-r--r--clang/lib/Sema/SemaDeclAttr.cpp13
-rw-r--r--clang/lib/Sema/SemaExprCXX.cpp2
-rw-r--r--clang/lib/Sema/SemaHLSL.cpp5
-rw-r--r--clang/lib/Sema/SemaInit.cpp3
-rw-r--r--clang/lib/Sema/SemaOpenMP.cpp21
-rw-r--r--clang/lib/Sema/SemaOverload.cpp2
-rw-r--r--clang/lib/Sema/SemaPPC.cpp3
-rw-r--r--clang/lib/Sema/SemaStmtAttr.cpp23
-rw-r--r--clang/lib/Sema/SemaTemplate.cpp2
-rw-r--r--clang/lib/Sema/SemaTemplateDeductionGuide.cpp119
-rw-r--r--clang/lib/Sema/SemaTemplateInstantiateDecl.cpp10
-rw-r--r--clang/lib/Sema/TreeTransform.h50
-rw-r--r--clang/lib/Serialization/ASTReader.cpp15
-rw-r--r--clang/lib/Serialization/ASTWriter.cpp54
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp28
-rw-r--r--clang/lib/Tooling/ArgumentsAdjusters.cpp5
-rw-r--r--clang/test/APINotes/Inputs/Headers/Methods.h6
-rw-r--r--clang/test/AST/Interp/constexpr-subobj-initialization.cpp20
-rw-r--r--clang/test/AST/Interp/cxx2a.cpp21
-rw-r--r--clang/test/AST/Interp/lifetimes.cpp27
-rw-r--r--clang/test/AST/Interp/records.cpp16
-rw-r--r--clang/test/AST/attr-print-emit.cpp8
-rw-r--r--clang/test/AST/explicit-base-class-move-cntr.cpp171
-rw-r--r--clang/test/Analysis/live-stmts.cpp109
-rw-r--r--clang/test/Analysis/short-circuiting-eval.cpp39
-rw-r--r--clang/test/Analysis/stream.c42
-rw-r--r--clang/test/C/C23/n3018.c1
-rw-r--r--clang/test/CXX/dcl.decl/dcl.meaning/dcl.mptr/p2.cpp64
-rw-r--r--clang/test/CXX/temp/temp.res/p3.cpp10
-rw-r--r--clang/test/CodeGen/LoongArch/align.c58
-rw-r--r--clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c8
-rw-r--r--clang/test/CodeGen/arm64ec-hybrid-patchable.c34
-rw-r--r--clang/test/CodeGen/inline-asm-size-zero.c6
-rw-r--r--clang/test/CodeGen/math-libcalls-tbaa.c170
-rw-r--r--clang/test/CodeGen/math-libcalls-tbaa.cpp41
-rw-r--r--clang/test/CodeGen/pr3518.c1
-rw-r--r--clang/test/CodeGenCUDA/convergent.cu104
-rw-r--r--clang/test/CodeGenCXX/debug-info-explicit-this.cpp16
-rw-r--r--clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp78
-rw-r--r--clang/test/Driver/amdgpu-toolchain.c10
-rw-r--r--clang/test/Driver/cuda-cross-compiling.c7
-rw-r--r--clang/test/Driver/immediate-options.c2
-rw-r--r--clang/test/Driver/linker-wrapper.c20
-rw-r--r--clang/test/Driver/openmp-offload-gpu.c2
-rw-r--r--clang/test/Driver/ps4-linker.c5
-rw-r--r--clang/test/Driver/unified-lto.c2
-rw-r--r--clang/test/Driver/x86-target-features.c13
-rw-r--r--clang/test/Index/pch-with-errors.c2
-rw-r--r--clang/test/Misc/pragma-attribute-supported-attributes-list.test2
-rw-r--r--clang/test/Modules/load-module-with-errors.m4
-rw-r--r--clang/test/OpenMP/nvptx_target_printf_codegen.c179
-rw-r--r--clang/test/Preprocessor/embed_weird.cpp2
-rw-r--r--clang/test/Preprocessor/x86_target_features.c4
-rw-r--r--clang/test/Sema/attr-ownership.c5
-rw-r--r--clang/test/Sema/attr-ownership.cpp6
-rw-r--r--clang/test/SemaCUDA/attr-noconvergent.cu34
-rw-r--r--clang/test/SemaCXX/constexpr-subobj-initialization.cpp7
-rw-r--r--clang/test/SemaCXX/cxx2b-deducing-this.cpp7
-rw-r--r--clang/test/SemaCXX/destructor.cpp9
-rw-r--r--clang/test/SemaHLSL/Loops/unroll.hlsl5
-rw-r--r--clang/test/TableGen/attrs-parser-string-switches.td232
-rw-r--r--clang/test/Tooling/clang-check-extra-arg.cpp4
-rw-r--r--clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp33
-rw-r--r--clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td8
-rw-r--r--clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp14
-rw-r--r--clang/tools/clang-nvlink-wrapper/NVLinkOpts.td7
-rw-r--r--clang/tools/clang-repl/CMakeLists.txt2
-rw-r--r--clang/tools/driver/CMakeLists.txt4
-rw-r--r--clang/tools/libclang/CIndex.cpp4
-rw-r--r--clang/tools/libclang/CMakeLists.txt4
-rw-r--r--clang/unittests/AST/ASTImporterTest.cpp148
-rw-r--r--clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp4
-rw-r--r--clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp39
-rw-r--r--clang/unittests/Format/TokenAnnotatorTest.cpp23
-rw-r--r--clang/utils/TableGen/ClangAttrEmitter.cpp74
-rw-r--r--cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake24
-rw-r--r--compiler-rt/CMakeLists.txt4
-rw-r--r--compiler-rt/cmake/config-ix.cmake3
-rw-r--r--compiler-rt/include/profile/InstrProfData.inc1
-rw-r--r--compiler-rt/lib/builtins/aarch64/sme-abi-vg.c5
-rw-r--r--compiler-rt/lib/builtins/cpu_model/riscv.c3
-rw-r--r--compiler-rt/lib/builtins/os_version_check.c26
-rw-r--r--compiler-rt/lib/gwp_asan/definitions.h3
-rw-r--r--compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp21
-rw-r--r--compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp23
-rw-r--r--compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp13
-rw-r--r--compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp21
-rw-r--r--compiler-rt/lib/gwp_asan/tests/CMakeLists.txt4
-rw-r--r--compiler-rt/lib/gwp_asan/tests/utilities.cpp24
-rw-r--r--compiler-rt/lib/gwp_asan/utilities.h13
-rw-r--r--compiler-rt/lib/interception/interception_linux.h16
-rw-r--r--compiler-rt/lib/memprof/memprof_mapping.h2
-rw-r--r--compiler-rt/lib/nsan/nsan_interceptors.cpp10
-rw-r--r--compiler-rt/lib/profile/InstrProfiling.h1
-rw-r--r--compiler-rt/lib/profile/InstrProfilingFile.c42
-rw-r--r--compiler-rt/lib/rtsan/rtsan_interceptors.cpp12
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp151
-rw-r--r--compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp13
-rw-r--r--compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp6
-rw-r--r--compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c11
-rwxr-xr-xcompiler-rt/test/builtins/Unit/ppc/test18
-rwxr-xr-xcompiler-rt/test/builtins/Unit/test63
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp2
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp2
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp2
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp2
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp3
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp4
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_i386.cpp17
-rw-r--r--compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_x86_64.cpp19
-rw-r--r--flang/include/flang/Common/Fortran-features.h3
-rw-r--r--flang/include/flang/Evaluate/tools.h49
-rw-r--r--flang/include/flang/Optimizer/Builder/IntrinsicCall.h9
-rw-r--r--flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h8
-rw-r--r--flang/include/flang/Runtime/exceptions.h9
-rw-r--r--flang/include/flang/Runtime/magic-numbers.h4
-rw-r--r--flang/include/flang/Semantics/semantics.h4
-rw-r--r--flang/include/flang/Semantics/symbol.h27
-rw-r--r--flang/include/flang/Semantics/tools.h4
-rw-r--r--flang/lib/Evaluate/fold-character.cpp6
-rw-r--r--flang/lib/Evaluate/fold-implementation.h52
-rw-r--r--flang/lib/Evaluate/fold-integer.cpp39
-rw-r--r--flang/lib/Evaluate/fold-real.cpp4
-rw-r--r--flang/lib/Evaluate/real.cpp4
-rw-r--r--flang/lib/Evaluate/variable.cpp3
-rw-r--r--flang/lib/Lower/OpenMP/ClauseProcessor.cpp97
-rw-r--r--flang/lib/Lower/OpenMP/ClauseProcessor.h2
-rw-r--r--flang/lib/Lower/OpenMP/DataSharingProcessor.cpp7
-rw-r--r--flang/lib/Lower/OpenMP/OpenMP.cpp112
-rw-r--r--flang/lib/Optimizer/Builder/FIRBuilder.cpp5
-rw-r--r--flang/lib/Optimizer/Builder/IntrinsicCall.cpp228
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp8
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Numeric.cpp18
-rw-r--r--flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp24
-rw-r--r--flang/lib/Parser/executable-parsers.cpp25
-rw-r--r--flang/lib/Parser/prescan.cpp6
-rw-r--r--flang/lib/Parser/program-parsers.cpp99
-rw-r--r--flang/lib/Parser/stmt-parser.h6
-rw-r--r--flang/lib/Semantics/check-allocate.cpp11
-rw-r--r--flang/lib/Semantics/check-call.cpp27
-rw-r--r--flang/lib/Semantics/check-cuda.cpp32
-rw-r--r--flang/lib/Semantics/check-declarations.cpp2
-rw-r--r--flang/lib/Semantics/check-purity.cpp2
-rw-r--r--flang/lib/Semantics/definable.cpp6
-rw-r--r--flang/lib/Semantics/definable.h4
-rw-r--r--flang/lib/Semantics/mod-file.cpp23
-rw-r--r--flang/lib/Semantics/pointer-assignment.cpp4
-rw-r--r--flang/lib/Semantics/resolve-directives.cpp152
-rw-r--r--flang/lib/Semantics/resolve-names.cpp91
-rw-r--r--flang/lib/Semantics/runtime-type-info.cpp9
-rw-r--r--flang/lib/Semantics/semantics.cpp46
-rw-r--r--flang/lib/Semantics/symbol.cpp15
-rw-r--r--flang/lib/Semantics/tools.cpp10
-rw-r--r--flang/lib/Semantics/type.cpp34
-rw-r--r--flang/runtime/exceptions.cpp75
-rw-r--r--flang/test/Driver/omp-driver-offload.f9039
-rw-r--r--flang/test/Driver/target-cpu-features.f904
-rw-r--r--flang/test/Driver/target-gpu-features.f902
-rw-r--r--flang/test/Evaluate/fold-assumed-type-rank.f906
-rw-r--r--flang/test/Evaluate/fold-nearest.f9014
-rw-r--r--flang/test/Evaluate/folding08.f908
-rw-r--r--flang/test/Evaluate/rewrite08.f9021
-rw-r--r--flang/test/Fir/convert-to-llvm-openmp-and-fir.fir2
-rw-r--r--flang/test/Lower/CUDA/cuda-data-transfer.cuf2
-rw-r--r--flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf4
-rw-r--r--flang/test/Lower/Intrinsics/ieee_next.f90284
-rw-r--r--flang/test/Lower/Intrinsics/nearest.f90475
-rw-r--r--flang/test/Lower/OpenACC/acc-loop.f902
-rw-r--r--flang/test/Lower/OpenMP/associate.f9038
-rw-r--r--flang/test/Lower/OpenMP/default-clause-implied-do-fix.f9015
-rw-r--r--flang/test/Lower/OpenMP/target.f9010
-rw-r--r--flang/test/Lower/OpenMP/task.f902
-rw-r--r--flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f906
-rw-r--r--flang/test/Parser/cuf-sanity-common3
-rw-r--r--flang/test/Parser/recovery03.f909
-rw-r--r--flang/test/Parser/recovery04.f9024
-rw-r--r--flang/test/Preprocessing/line-in-contin.F9020
-rw-r--r--flang/test/Semantics/OpenMP/copyprivate04.f90112
-rw-r--r--flang/test/Semantics/OpenMP/do05-positivecase.f904
-rw-r--r--flang/test/Semantics/OpenMP/do20.f902
-rw-r--r--flang/test/Semantics/OpenMP/implicit-dsa.f9014
-rw-r--r--flang/test/Semantics/OpenMP/parallel-shared05.f9017
-rw-r--r--flang/test/Semantics/OpenMP/reduction08.f9010
-rw-r--r--flang/test/Semantics/OpenMP/reduction09.f908
-rw-r--r--flang/test/Semantics/OpenMP/symbol01.f904
-rw-r--r--flang/test/Semantics/OpenMP/symbol02.f904
-rw-r--r--flang/test/Semantics/OpenMP/symbol03.f904
-rw-r--r--flang/test/Semantics/OpenMP/symbol05.f904
-rw-r--r--flang/test/Semantics/OpenMP/symbol07.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol08.f9036
-rw-r--r--flang/test/Semantics/OpenMP/symbol09.f902
-rw-r--r--flang/test/Semantics/associate04.f907
-rw-r--r--flang/test/Semantics/call02.f902
-rw-r--r--flang/test/Semantics/call05.f902
-rw-r--r--flang/test/Semantics/contiguous01.f902
-rw-r--r--flang/test/Semantics/cuf09.cuf10
-rw-r--r--flang/test/Semantics/reduce.cuf6
-rw-r--r--flang/test/Semantics/resolve33.f9014
-rw-r--r--flang/test/Semantics/resolve53.f906
-rw-r--r--flang/test/Semantics/undef-result01.f90144
-rw-r--r--flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp8
-rw-r--r--libc/CMakeLists.txt2
-rw-r--r--libc/benchmarks/gpu/CMakeLists.txt8
-rw-r--r--libc/benchmarks/gpu/LibcGpuBenchmark.cpp6
-rw-r--r--libc/benchmarks/gpu/LibcGpuBenchmark.h57
-rw-r--r--libc/benchmarks/gpu/src/CMakeLists.txt1
-rw-r--r--libc/benchmarks/gpu/src/math/CMakeLists.txt44
-rw-r--r--libc/benchmarks/gpu/src/math/sin_benchmark.cpp65
-rw-r--r--libc/benchmarks/gpu/timing/amdgpu/timing.h8
-rw-r--r--libc/benchmarks/gpu/timing/nvptx/timing.h6
-rw-r--r--libc/cmake/modules/LLVMLibCLibraryRules.cmake16
-rw-r--r--libc/cmake/modules/LLVMLibCTestRules.cmake4
-rw-r--r--libc/cmake/modules/prepare_libc_gpu_build.cmake30
-rw-r--r--libc/config/config.json16
-rw-r--r--libc/config/darwin/arm/entrypoints.txt2
-rw-r--r--libc/config/darwin/x86_64/entrypoints.txt2
-rw-r--r--libc/config/gpu/entrypoints.txt7
-rw-r--r--libc/config/linux/aarch64/entrypoints.txt30
-rw-r--r--libc/config/linux/arm/entrypoints.txt7
-rw-r--r--libc/config/linux/riscv/entrypoints.txt14
-rw-r--r--libc/config/linux/x86_64/entrypoints.txt18
-rw-r--r--libc/config/windows/entrypoints.txt2
-rw-r--r--libc/docs/configure.rst5
-rw-r--r--libc/docs/dev/header_generation.rst2
-rw-r--r--libc/docs/dev/undefined_behavior.rst26
-rw-r--r--libc/docs/math/index.rst12
-rw-r--r--libc/fuzzing/math/RemQuoDiff.h11
-rw-r--r--libc/fuzzing/stdlib/CMakeLists.txt8
-rw-r--r--libc/fuzzing/stdlib/heap_sort_fuzz.cpp49
-rw-r--r--libc/fuzzing/stdlib/strtofloat_fuzz.cpp19
-rw-r--r--libc/hdr/CMakeLists.txt9
-rw-r--r--libc/hdr/math_function_macros.h27
-rw-r--r--libc/hdr/math_macros.h1
-rw-r--r--libc/include/llvm-libc-types/jmp_buf.h5
-rw-r--r--libc/lib/CMakeLists.txt27
-rw-r--r--libc/newhdrgen/yaml/pthread.yaml14
-rw-r--r--libc/newhdrgen/yaml/stdio.yaml8
-rw-r--r--libc/spec/gpu_ext.td8
-rw-r--r--libc/spec/llvm_libc_ext.td3
-rw-r--r--libc/spec/posix.td25
-rw-r--r--libc/spec/stdc.td24
-rw-r--r--libc/src/__support/FPUtil/BasicOperations.h3
-rw-r--r--libc/src/__support/FPUtil/CMakeLists.txt1
-rw-r--r--libc/src/__support/OSUtil/CMakeLists.txt17
-rw-r--r--libc/src/__support/OSUtil/linux/CMakeLists.txt13
-rw-r--r--libc/src/__support/OSUtil/pid.h41
-rw-r--r--libc/src/__support/threads/CMakeLists.txt27
-rw-r--r--libc/src/__support/threads/linux/CMakeLists.txt1
-rw-r--r--libc/src/__support/threads/linux/rwlock.h9
-rw-r--r--libc/src/__support/threads/linux/thread.cpp2
-rw-r--r--libc/src/__support/threads/thread.h27
-rw-r--r--libc/src/__support/threads/tid.h34
-rw-r--r--libc/src/compiler/generic/__stack_chk_fail.cpp2
-rw-r--r--libc/src/gpu/CMakeLists.txt12
-rw-r--r--libc/src/gpu/rpc_fprintf.cpp75
-rw-r--r--libc/src/math/CMakeLists.txt16
-rw-r--r--libc/src/math/dfmaf128.h (renamed from libc/src/gpu/rpc_fprintf.h)14
-rw-r--r--libc/src/math/dfmal.h (renamed from libc/src/__support/OSUtil/linux/pid.cpp)16
-rw-r--r--libc/src/math/dsubf128.h21
-rw-r--r--libc/src/math/dsubl.h (renamed from libc/src/unistd/gettid.cpp)13
-rw-r--r--libc/src/math/expf16.h21
-rw-r--r--libc/src/math/generic/CMakeLists.txt183
-rw-r--r--libc/src/math/generic/dfmaf128.cpp25
-rw-r--r--libc/src/math/generic/dfmal.cpp21
-rw-r--r--libc/src/math/generic/dsubf128.cpp20
-rw-r--r--libc/src/math/generic/dsubl.cpp20
-rw-r--r--libc/src/math/generic/expf16.cpp172
-rw-r--r--libc/src/math/generic/getpayload.cpp20
-rw-r--r--libc/src/math/generic/getpayloadf.cpp20
-rw-r--r--libc/src/math/generic/getpayloadf128.cpp20
-rw-r--r--libc/src/math/generic/setpayload.cpp20
-rw-r--r--libc/src/math/generic/setpayloadf.cpp20
-rw-r--r--libc/src/math/generic/setpayloadf128.cpp20
-rw-r--r--libc/src/math/generic/totalorder.cpp20
-rw-r--r--libc/src/math/generic/totalorderf.cpp20
-rw-r--r--libc/src/math/generic/totalorderf128.cpp21
-rw-r--r--libc/src/math/getpayload.h (renamed from libc/src/unistd/gettid.h)13
-rw-r--r--libc/src/math/getpayloadf.h20
-rw-r--r--libc/src/math/getpayloadf128.h21
-rw-r--r--libc/src/math/setpayload.h20
-rw-r--r--libc/src/math/setpayloadf.h20
-rw-r--r--libc/src/math/setpayloadf128.h21
-rw-r--r--libc/src/math/totalorder.h20
-rw-r--r--libc/src/math/totalorderf.h20
-rw-r--r--libc/src/math/totalorderf128.h21
-rw-r--r--libc/src/pthread/CMakeLists.txt22
-rw-r--r--libc/src/pthread/pthread_rwlock_clockrdlock.cpp50
-rw-r--r--libc/src/pthread/pthread_rwlock_clockrdlock.h23
-rw-r--r--libc/src/pthread/pthread_rwlock_clockwrlock.cpp51
-rw-r--r--libc/src/pthread/pthread_rwlock_clockwrlock.h23
-rw-r--r--libc/src/setjmp/aarch64/CMakeLists.txt28
-rw-r--r--libc/src/setjmp/aarch64/longjmp.cpp92
-rw-r--r--libc/src/setjmp/aarch64/setjmp.cpp94
-rw-r--r--libc/src/stdio/CMakeLists.txt12
-rw-r--r--libc/src/stdio/vsscanf.cpp33
-rw-r--r--libc/src/stdio/vsscanf.h20
-rw-r--r--libc/src/stdlib/CMakeLists.txt19
-rw-r--r--libc/src/stdlib/gpu/CMakeLists.txt33
-rw-r--r--libc/src/stdlib/gpu/aligned_alloc.cpp29
-rw-r--r--libc/src/stdlib/gpu/calloc.cpp31
-rw-r--r--libc/src/stdlib/gpu/realloc.cpp32
-rw-r--r--libc/src/unistd/CMakeLists.txt10
-rw-r--r--libc/src/unistd/getpid.h4
-rw-r--r--libc/src/unistd/linux/CMakeLists.txt4
-rw-r--r--libc/src/unistd/linux/fork.cpp32
-rw-r--r--libc/src/unistd/linux/getpid.cpp11
-rw-r--r--libc/startup/gpu/CMakeLists.txt2
-rw-r--r--libc/startup/linux/CMakeLists.txt1
-rw-r--r--libc/startup/linux/do_start.cpp5
-rw-r--r--libc/test/UnitTest/FPMatcher.h3
-rw-r--r--libc/test/integration/src/pthread/CMakeLists.txt2
-rw-r--r--libc/test/integration/src/pthread/pthread_rwlock_test.cpp65
-rw-r--r--libc/test/integration/src/unistd/CMakeLists.txt4
-rw-r--r--libc/test/integration/src/unistd/fork_test.cpp24
-rw-r--r--libc/test/src/math/CMakeLists.txt60
-rw-r--r--libc/test/src/math/CopySignTest.h2
-rw-r--r--libc/test/src/math/FAbsTest.h2
-rw-r--r--libc/test/src/math/FDimTest.h4
-rw-r--r--libc/test/src/math/FMaxTest.h4
-rw-r--r--libc/test/src/math/FMinTest.h5
-rw-r--r--libc/test/src/math/FrexpTest.h2
-rw-r--r--libc/test/src/math/ILogbTest.h4
-rw-r--r--libc/test/src/math/LogbTest.h2
-rw-r--r--libc/test/src/math/ModfTest.h2
-rw-r--r--libc/test/src/math/RemQuoTest.h2
-rw-r--r--libc/test/src/math/acosf_test.cpp2
-rw-r--r--libc/test/src/math/acoshf_test.cpp2
-rw-r--r--libc/test/src/math/asinf_test.cpp2
-rw-r--r--libc/test/src/math/asinhf_test.cpp2
-rw-r--r--libc/test/src/math/atan2f_test.cpp6
-rw-r--r--libc/test/src/math/cbrt_test.cpp8
-rw-r--r--libc/test/src/math/cos_test.cpp4
-rw-r--r--libc/test/src/math/cosf_test.cpp2
-rw-r--r--libc/test/src/math/coshf_test.cpp2
-rw-r--r--libc/test/src/math/dfmal_test.cpp (renamed from libc/test/src/unistd/gettid_test.cpp)12
-rw-r--r--libc/test/src/math/dsubl_test.cpp13
-rw-r--r--libc/test/src/math/erff_test.cpp4
-rw-r--r--libc/test/src/math/exp10_test.cpp4
-rw-r--r--libc/test/src/math/exp10f_test.cpp5
-rw-r--r--libc/test/src/math/exp2_test.cpp4
-rw-r--r--libc/test/src/math/exp2f_test.cpp5
-rw-r--r--libc/test/src/math/exp2m1f_test.cpp5
-rw-r--r--libc/test/src/math/exp_test.cpp4
-rw-r--r--libc/test/src/math/expf16_test.cpp40
-rw-r--r--libc/test/src/math/expf_test.cpp5
-rw-r--r--libc/test/src/math/explogxf_test.cpp6
-rw-r--r--libc/test/src/math/expm1_test.cpp4
-rw-r--r--libc/test/src/math/expm1f_test.cpp5
-rw-r--r--libc/test/src/math/log10_test.cpp4
-rw-r--r--libc/test/src/math/log1p_test.cpp4
-rw-r--r--libc/test/src/math/log1pf_test.cpp2
-rw-r--r--libc/test/src/math/log2_test.cpp4
-rw-r--r--libc/test/src/math/log2f_test.cpp5
-rw-r--r--libc/test/src/math/log_test.cpp4
-rw-r--r--libc/test/src/math/logf_test.cpp2
-rw-r--r--libc/test/src/math/performance_testing/CMakeLists.txt11
-rw-r--r--libc/test/src/math/performance_testing/expf16_perf.cpp22
-rw-r--r--libc/test/src/math/powf_test.cpp6
-rw-r--r--libc/test/src/math/sin_test.cpp4
-rw-r--r--libc/test/src/math/sincos_test.cpp2
-rw-r--r--libc/test/src/math/sincosf_test.cpp2
-rw-r--r--libc/test/src/math/sinf_test.cpp2
-rw-r--r--libc/test/src/math/sinhf_test.cpp2
-rw-r--r--libc/test/src/math/smoke/CMakeLists.txt199
-rw-r--r--libc/test/src/math/smoke/SubTest.h2
-rw-r--r--libc/test/src/math/smoke/dfmaf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/dfmal_test.cpp13
-rw-r--r--libc/test/src/math/smoke/dsubf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/dsubl_test.cpp13
-rw-r--r--libc/test/src/math/smoke/expf16_test.cpp66
-rw-r--r--libc/test/src/math/smoke/getpayload_test.cpp13
-rw-r--r--libc/test/src/math/smoke/getpayloadf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/getpayloadf_test.cpp13
-rw-r--r--libc/test/src/math/smoke/setpayload_test.cpp13
-rw-r--r--libc/test/src/math/smoke/setpayloadf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/setpayloadf_test.cpp13
-rw-r--r--libc/test/src/math/smoke/totalorder_test.cpp13
-rw-r--r--libc/test/src/math/smoke/totalorderf128_test.cpp13
-rw-r--r--libc/test/src/math/smoke/totalorderf_test.cpp13
-rw-r--r--libc/test/src/math/tan_test.cpp4
-rw-r--r--libc/test/src/math/tanf_test.cpp2
-rw-r--r--libc/test/src/math/tanhf_test.cpp2
-rw-r--r--libc/test/src/stdio/CMakeLists.txt14
-rw-r--r--libc/test/src/stdio/vsscanf_test.cpp159
-rw-r--r--libc/test/src/unistd/CMakeLists.txt10
-rw-r--r--libc/test/utils/FPUtil/x86_long_double_test.cpp12
-rw-r--r--libc/utils/MPFRWrapper/MPFRUtils.cpp6
-rw-r--r--libcxx/.clang-format1
-rw-r--r--libcxx/CMakeLists.txt20
-rw-r--r--libcxx/cmake/Modules/CodeCoverage.cmake50
-rw-r--r--libcxx/cmake/caches/Generic-no-exceptions.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-experimental.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-filesystem.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-localization.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-random_device.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-rtti.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-threads.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-tzdb.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-unicode.cmake4
-rw-r--r--libcxx/cmake/caches/Generic-no-wide-characters.cmake4
-rw-r--r--libcxx/cmake/config-ix.cmake2
-rw-r--r--libcxx/docs/ReleaseNotes/20.rst4
-rw-r--r--libcxx/docs/Status/Cxx20Issues.csv10
-rw-r--r--libcxx/docs/Status/Cxx20Papers.csv1
-rw-r--r--libcxx/docs/Status/Cxx23Issues.csv16
-rw-r--r--libcxx/docs/Status/Cxx2cIssues.csv2
-rw-r--r--libcxx/docs/TestingLibcxx.rst10
-rw-r--r--libcxx/include/__algorithm/find_end.h103
-rw-r--r--libcxx/include/__atomic/atomic.h16
-rw-r--r--libcxx/include/__atomic/atomic_base.h2
-rw-r--r--libcxx/include/__atomic/atomic_flag.h38
-rw-r--r--libcxx/include/__atomic/atomic_ref.h15
-rw-r--r--libcxx/include/__config8
-rw-r--r--libcxx/include/__format/formatter.h3
-rw-r--r--libcxx/include/__format/formatter_bool.h6
-rw-r--r--libcxx/include/__format/formatter_char.h10
-rw-r--r--libcxx/include/__format/formatter_floating_point.h8
-rw-r--r--libcxx/include/__format/formatter_integer.h33
-rw-r--r--libcxx/include/__format/formatter_pointer.h8
-rw-r--r--libcxx/include/__format/formatter_string.h27
-rw-r--r--libcxx/include/__iterator/bounded_iter.h4
-rw-r--r--libcxx/include/__iterator/wrap_iter.h5
-rw-r--r--libcxx/include/__memory_resource/polymorphic_allocator.h11
-rw-r--r--libcxx/include/__mutex/unique_lock.h4
-rw-r--r--libcxx/include/__thread/thread.h1
-rw-r--r--libcxx/include/array57
-rw-r--r--libcxx/include/atomic92
-rw-r--r--libcxx/include/barrier10
-rw-r--r--libcxx/include/deque6
-rw-r--r--libcxx/include/format15
-rw-r--r--libcxx/include/latch10
-rw-r--r--libcxx/include/semaphore16
-rw-r--r--libcxx/include/stdatomic.h4
-rw-r--r--libcxx/modules/std/format.inc2
-rw-r--r--libcxx/src/CMakeLists.txt5
-rw-r--r--libcxx/test/CMakeLists.txt18
-rw-r--r--libcxx/test/benchmarks/CMakeLists.txt (renamed from libcxx/benchmarks/CMakeLists.txt)6
-rw-r--r--libcxx/test/benchmarks/CartesianBenchmarks.h (renamed from libcxx/benchmarks/CartesianBenchmarks.h)0
-rw-r--r--libcxx/test/benchmarks/ContainerBenchmarks.h (renamed from libcxx/benchmarks/ContainerBenchmarks.h)0
-rw-r--r--libcxx/test/benchmarks/GenerateInput.h (renamed from libcxx/benchmarks/GenerateInput.h)8
-rw-r--r--libcxx/test/benchmarks/Utilities.h (renamed from libcxx/benchmarks/Utilities.h)0
-rw-r--r--libcxx/test/benchmarks/VariantBenchmarks.h (renamed from libcxx/benchmarks/VariantBenchmarks.h)0
-rw-r--r--libcxx/test/benchmarks/algorithms.partition_point.bench.cpp (renamed from libcxx/benchmarks/algorithms.partition_point.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/algorithms/common.h (renamed from libcxx/benchmarks/algorithms/common.h)0
-rw-r--r--libcxx/test/benchmarks/algorithms/count.bench.cpp (renamed from libcxx/benchmarks/algorithms/count.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/equal.bench.cpp (renamed from libcxx/benchmarks/algorithms/equal.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/fill.bench.cpp (renamed from libcxx/benchmarks/algorithms/fill.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/find.bench.cpp (renamed from libcxx/benchmarks/algorithms/find.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/for_each.bench.cpp (renamed from libcxx/benchmarks/algorithms/for_each.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp (renamed from libcxx/benchmarks/algorithms/lower_bound.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/make_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/make_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/min.bench.cpp (renamed from libcxx/benchmarks/algorithms/min.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp (renamed from libcxx/benchmarks/algorithms/min_max_element.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/minmax.bench.cpp (renamed from libcxx/benchmarks/algorithms/minmax.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/algorithms/mismatch.bench.cpp (renamed from libcxx/benchmarks/algorithms/mismatch.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/pop_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp (renamed from libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/push_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/push_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_contains.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_ends_with.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_make_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_pop_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_push_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_sort.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_sort_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp (renamed from libcxx/benchmarks/algorithms/ranges_stable_sort.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp (renamed from libcxx/benchmarks/algorithms/set_intersection.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/sort.bench.cpp (renamed from libcxx/benchmarks/algorithms/sort.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp (renamed from libcxx/benchmarks/algorithms/sort_heap.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp (renamed from libcxx/benchmarks/algorithms/stable_sort.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/allocation.bench.cpp (renamed from libcxx/benchmarks/allocation.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/atomic_wait.bench.cpp (renamed from libcxx/benchmarks/atomic_wait.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp (renamed from libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/deque.bench.cpp (renamed from libcxx/benchmarks/deque.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/deque_iterator.bench.cpp (renamed from libcxx/benchmarks/deque_iterator.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/exception_ptr.bench.cpp (renamed from libcxx/benchmarks/exception_ptr.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/filesystem.bench.cpp (renamed from libcxx/benchmarks/filesystem.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/format.bench.cpp (renamed from libcxx/benchmarks/format.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/format_to.bench.cpp (renamed from libcxx/benchmarks/format_to.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/format_to_n.bench.cpp (renamed from libcxx/benchmarks/format_to_n.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/formatted_size.bench.cpp (renamed from libcxx/benchmarks/formatted_size.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/formatter_float.bench.cpp (renamed from libcxx/benchmarks/formatter_float.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/formatter_int.bench.cpp (renamed from libcxx/benchmarks/formatter_int.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/function.bench.cpp (renamed from libcxx/benchmarks/function.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/join_view.bench.cpp (renamed from libcxx/benchmarks/join_view.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp (renamed from libcxx/benchmarks/lexicographical_compare_three_way.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp (renamed from libcxx/benchmarks/libcxxabi/dynamic_cast.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp (renamed from libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/lit.cfg.py.in (renamed from libcxx/benchmarks/lit.cfg.py)6
-rw-r--r--libcxx/test/benchmarks/lit.site.cfg.py.in (renamed from libcxx/benchmarks/lit.site.cfg.py.in)2
-rw-r--r--libcxx/test/benchmarks/map.bench.cpp (renamed from libcxx/benchmarks/map.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/monotonic_buffer.bench.cpp (renamed from libcxx/benchmarks/monotonic_buffer.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/numeric/gcd.bench.cpp (renamed from libcxx/benchmarks/numeric/gcd.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/ordered_set.bench.cpp (renamed from libcxx/benchmarks/ordered_set.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/random.bench.cpp (renamed from libcxx/benchmarks/random.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp (renamed from libcxx/benchmarks/shared_mutex_vs_mutex.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp (renamed from libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp (renamed from libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/stop_token.bench.cpp (renamed from libcxx/benchmarks/stop_token.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/string.bench.cpp (renamed from libcxx/benchmarks/string.bench.cpp)7
-rw-r--r--libcxx/test/benchmarks/stringstream.bench.cpp (renamed from libcxx/benchmarks/stringstream.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/system_error.bench.cpp (renamed from libcxx/benchmarks/system_error.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/to_chars.bench.cpp (renamed from libcxx/benchmarks/to_chars.bench.cpp)1
-rw-r--r--libcxx/test/benchmarks/unordered_set_operations.bench.cpp (renamed from libcxx/benchmarks/unordered_set_operations.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/util_smartptr.bench.cpp (renamed from libcxx/benchmarks/util_smartptr.bench.cpp)0
-rw-r--r--libcxx/test/benchmarks/variant_visit_1.bench.cpp (renamed from libcxx/benchmarks/variant_visit_1.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/variant_visit_2.bench.cpp (renamed from libcxx/benchmarks/variant_visit_2.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/variant_visit_3.bench.cpp (renamed from libcxx/benchmarks/variant_visit_3.bench.cpp)8
-rw-r--r--libcxx/test/benchmarks/vector_operations.bench.cpp (renamed from libcxx/benchmarks/vector_operations.bench.cpp)8
-rw-r--r--libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp1
-rw-r--r--libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp1
-rw-r--r--libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp1
-rw-r--r--libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp1
-rw-r--r--libcxx/test/libcxx/time/convert_to_tm.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp1
-rw-r--r--libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h1
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp6
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp17
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp17
-rw-r--r--libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp2
-rw-r--r--libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp165
-rw-r--r--libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp120
-rw-r--r--libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp34
-rw-r--r--libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp1
-rw-r--r--libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp2
-rw-r--r--libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp2
-rw-r--r--libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp5
-rw-r--r--libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp5
-rw-r--r--libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp5
-rw-r--r--libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp5
-rw-r--r--libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp4
-rw-r--r--libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h1
-rw-r--r--libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp1
-rw-r--r--libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp1
-rw-r--r--libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp1
-rw-r--r--libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h1
-rw-r--r--libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp1
-rw-r--r--libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp1
-rw-r--r--libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h1
-rw-r--r--libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp1
-rw-r--r--libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp1
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp5
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp5
-rw-r--r--libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp4
-rw-r--r--libcxx/test/std/thread/thread.barrier/arrive.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.barrier/completion.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.barrier/max.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.latch/count_down.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.latch/ctor.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.latch/max.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.latch/try_wait.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp6
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp)19
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp)17
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp14
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp20
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp20
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp19
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp19
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp4
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp67
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp72
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp73
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp55
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp34
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp34
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp48
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp22
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp28
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp22
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp)11
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h88
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.compile.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.verify.cpp)12
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.copy.compile.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/copy.verify.cpp)11
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.default.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/default.pass.cpp)11
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp110
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp172
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp66
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp93
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.fail.cpp)11
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.copy.compile.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/copy.compile.fail.cpp)10
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.default.pass.cpp (renamed from libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp)2
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp122
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp194
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp66
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp125
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp92
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp159
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp142
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp126
-rw-r--r--libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.semaphore/binary.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.semaphore/max.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.semaphore/release.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.semaphore/timed.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp5
-rw-r--r--libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h1
-rw-r--r--libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp1
-rw-r--r--libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp1
-rw-r--r--libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.day.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.duration.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.month.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.year.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp1
-rw-r--r--libcxx/test/std/time/time.syn/formatter_tests.h1
-rw-r--r--libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.locking/enable_nonlocking_formatter_optimization.compile.pass.cpp236
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_tests.h1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/format.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/format/types.compile.pass.cpp1
-rw-r--r--libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.class.general/equality.pass.cpp41
-rw-r--r--libcxx/test/support/atomic_helpers.h103
-rw-r--r--libcxx/test/support/format.functions.common.h1
-rw-r--r--libcxx/test/support/test.support/make_string_header.pass.cpp1
-rw-r--r--libcxx/test/support/test_basic_format_arg.h1
-rw-r--r--libcxx/test/support/test_iterators.h118
-rw-r--r--libcxx/utils/libcxx/test/features.py20
-rw-r--r--libcxxabi/src/demangle/ItaniumDemangle.h34
-rw-r--r--libcxxabi/test/test_demangle.pass.cpp2
-rw-r--r--libunwind/src/UnwindCursor.hpp3
-rw-r--r--libunwind/test/aix_signal_unwind.pass.sh.S2
-rw-r--r--lld/ELF/Config.h16
-rw-r--r--lld/ELF/Driver.cpp34
-rw-r--r--lld/ELF/ICF.cpp18
-rw-r--r--lld/ELF/InputFiles.cpp7
-rw-r--r--lld/ELF/InputFiles.h3
-rw-r--r--lld/ELF/InputSection.cpp8
-rw-r--r--lld/ELF/InputSection.h15
-rw-r--r--lld/ELF/Relocations.cpp22
-rw-r--r--lld/ELF/Relocations.h10
-rw-r--r--lld/ELF/ScriptLexer.cpp264
-rw-r--r--lld/ELF/ScriptLexer.h49
-rw-r--r--lld/ELF/ScriptParser.cpp269
-rw-r--r--lld/ELF/ScriptParser.h2
-rw-r--r--lld/ELF/Symbols.cpp1
-rw-r--r--lld/ELF/Symbols.h27
-rw-r--r--lld/ELF/SyntheticSections.cpp48
-rw-r--r--lld/ELF/SyntheticSections.h5
-rw-r--r--lld/ELF/Writer.cpp11
-rw-r--r--lld/docs/ReleaseNotes.rst4
-rw-r--r--lld/docs/ld.lld.13
-rw-r--r--lld/test/CMakeLists.txt1
-rw-r--r--lld/test/ELF/defsym.s13
-rw-r--r--lld/test/ELF/gc-sections-with-provide.s4
-rw-r--r--lld/test/ELF/linkerscript/at2.test2
-rw-r--r--lld/test/ELF/linkerscript/at3.test2
-rw-r--r--lld/test/ELF/linkerscript/group.s91
-rw-r--r--lld/test/ELF/linkerscript/header-phdr.test15
-rw-r--r--lld/test/ELF/linkerscript/include-cycle.s31
-rw-r--r--lld/test/ELF/linkerscript/insert-after.test2
-rw-r--r--lld/test/ELF/linkerscript/insert-before.test2
-rw-r--r--lld/test/ELF/linkerscript/invalid.test2
-rw-r--r--lld/test/ELF/linkerscript/map-file.test22
-rw-r--r--lld/test/ELF/linkerscript/map-file2.test2
-rw-r--r--lld/test/ELF/linkerscript/memory-err.s4
-rw-r--r--lld/test/ELF/linkerscript/outputarch.test12
-rw-r--r--lld/test/ELF/linkerscript/overlay.test12
-rw-r--r--lld/test/ELF/linkerscript/phdr-check.s15
-rw-r--r--lld/test/ELF/linkerscript/phdrs.s231
-rw-r--r--lld/test/ELF/linkerscript/region-alias.s2
-rw-r--r--lld/test/ELF/linkerscript/sections.s53
-rw-r--r--lld/test/ELF/linkerscript/symbolreferenced.s2
-rw-r--r--lld/test/ELF/linkerscript/unquoted.test9
-rw-r--r--lld/test/ELF/zsectionheader.s36
-rw-r--r--lldb/cmake/modules/AddLLDB.cmake1
-rw-r--r--lldb/include/lldb/Core/PluginManager.h20
-rw-r--r--lldb/include/lldb/Interpreter/CommandObject.h11
-rw-r--r--lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h7
-rw-r--r--lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h43
-rw-r--r--lldb/include/lldb/Symbol/SymbolFile.h10
-rw-r--r--lldb/include/lldb/Symbol/UnwindPlan.h17
-rw-r--r--lldb/include/lldb/Target/Target.h2
-rw-r--r--lldb/include/lldb/lldb-private-interfaces.h3
-rw-r--r--lldb/packages/Python/lldbsuite/test/lldbplatformutil.py10
-rw-r--r--lldb/packages/Python/lldbsuite/test/lldbtest.py18
-rw-r--r--lldb/source/Commands/CommandObjectBreakpoint.cpp27
-rw-r--r--lldb/source/Commands/CommandObjectBreakpointCommand.cpp12
-rw-r--r--lldb/source/Commands/CommandObjectDisassemble.cpp14
-rw-r--r--lldb/source/Commands/CommandObjectExpression.cpp4
-rw-r--r--lldb/source/Commands/CommandObjectFrame.cpp28
-rw-r--r--lldb/source/Commands/CommandObjectProcess.cpp2
-rw-r--r--lldb/source/Commands/CommandObjectScripting.cpp126
-rw-r--r--lldb/source/Commands/CommandObjectTarget.cpp207
-rw-r--r--lldb/source/Commands/CommandObjectThread.cpp2
-rw-r--r--lldb/source/Commands/CommandObjectWatchpoint.cpp96
-rw-r--r--lldb/source/Commands/CommandObjectWatchpoint.h2
-rw-r--r--lldb/source/Commands/CommandObjectWatchpointCommand.cpp18
-rw-r--r--lldb/source/Commands/Options.td6
-rw-r--r--lldb/source/Core/PluginManager.cpp65
-rw-r--r--lldb/source/Interpreter/CMakeLists.txt4
-rw-r--r--lldb/source/Interpreter/CommandObject.cpp28
-rw-r--r--lldb/source/Interpreter/Interfaces/CMakeLists.txt10
-rw-r--r--lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp37
-rw-r--r--lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp4
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt8
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt16
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp (renamed from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp)25
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h (renamed from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h)18
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt16
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp (renamed from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp)24
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h (renamed from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h)17
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt16
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp (renamed from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp)26
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h (renamed from lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h)18
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp7
-rw-r--r--lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp4
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp4
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp12
-rw-r--r--lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp163
-rw-r--r--lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h6
-rw-r--r--lldb/source/Symbol/UnwindPlan.cpp17
-rw-r--r--lldb/source/Target/RegisterContextUnwind.cpp9
-rw-r--r--lldb/source/Target/Target.cpp21
-rw-r--r--lldb/source/Target/TargetProperties.td3
-rw-r--r--lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py10
-rw-r--r--lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp36
-rw-r--r--lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py26
-rw-r--r--lldb/test/API/lang/c/struct_types/main.c3
-rw-r--r--lldb/test/Shell/SymbolFile/DWARF/vla.cpp80
-rw-r--r--lldb/tools/lldb-dap/README.md31
-rw-r--r--lldb/tools/lldb-dap/package.json4
-rw-r--r--lldb/tools/lldb-server/LLDBServerUtilities.cpp2
-rw-r--r--llvm/CMakeLists.txt13
-rw-r--r--llvm/CODE_OWNERS.TXT4
-rw-r--r--llvm/cmake/modules/AddLLVM.cmake6
-rw-r--r--llvm/cmake/modules/HandleLLVMOptions.cmake8
-rw-r--r--llvm/cmake/modules/HandleLLVMStdlib.cmake6
-rw-r--r--llvm/cmake/modules/LLVMCheckLinkerFlag.cmake28
-rw-r--r--llvm/docs/CommandGuide/llvm-objcopy.rst9
-rw-r--r--llvm/docs/DirectX/DXILResources.rst25
-rw-r--r--llvm/docs/GettingStarted.rst4
-rw-r--r--llvm/docs/ProgrammersManual.rst2
-rw-r--r--llvm/docs/SourceLevelDebugging.rst42
-rw-r--r--llvm/include/llvm-c/Error.h8
-rw-r--r--llvm/include/llvm-c/Target.h2
-rw-r--r--llvm/include/llvm/ADT/APFloat.h6
-rw-r--r--llvm/include/llvm/ADT/DenseMap.h3
-rw-r--r--llvm/include/llvm/ADT/GenericCycleImpl.h8
-rw-r--r--llvm/include/llvm/ADT/StableHashing.h12
-rw-r--r--llvm/include/llvm/Analysis/DXILResource.h73
-rw-r--r--llvm/include/llvm/Analysis/GenericDomTreeUpdater.h2
-rw-r--r--llvm/include/llvm/Analysis/Loads.h13
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolution.h3
-rw-r--r--llvm/include/llvm/Analysis/TargetLibraryInfo.def30
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h12
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfoImpl.h4
-rw-r--r--llvm/include/llvm/Analysis/ValueTracking.h7
-rw-r--r--llvm/include/llvm/Analysis/VecFuncs.def27
-rw-r--r--llvm/include/llvm/AsmParser/LLParser.h3
-rw-r--r--llvm/include/llvm/CodeGen/DebugHandlerBase.h2
-rw-r--r--llvm/include/llvm/CodeGen/SDPatternMatch.h16
-rw-r--r--llvm/include/llvm/CodeGen/TargetLowering.h3
-rw-r--r--llvm/include/llvm/CodeGen/ValueTypes.td16
-rw-r--r--llvm/include/llvm/CodeGenTypes/MachineValueType.h2
-rw-r--r--llvm/include/llvm/Frontend/HLSL/HLSLResource.h1
-rw-r--r--llvm/include/llvm/IR/BasicBlock.h10
-rw-r--r--llvm/include/llvm/IR/Function.h35
-rw-r--r--llvm/include/llvm/IR/PatternMatch.h24
-rw-r--r--llvm/include/llvm/MC/MCAsmBackend.h5
-rw-r--r--llvm/include/llvm/MC/MCAssembler.h4
-rw-r--r--llvm/include/llvm/MC/MCELFStreamer.h2
-rw-r--r--llvm/include/llvm/MC/MCFragment.h58
-rw-r--r--llvm/include/llvm/MC/MCSection.h5
-rw-r--r--llvm/include/llvm/ObjCopy/CommonConfig.h13
-rw-r--r--llvm/include/llvm/Passes/PassBuilder.h4
-rw-r--r--llvm/include/llvm/ProfileData/InstrProf.h4
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfData.inc1
-rw-r--r--llvm/include/llvm/SandboxIR/SandboxIR.h966
-rw-r--r--llvm/include/llvm/SandboxIR/SandboxIRValues.def47
-rw-r--r--llvm/include/llvm/SandboxIR/Tracker.h94
-rw-r--r--llvm/include/llvm/SandboxIR/Use.h5
-rw-r--r--llvm/include/llvm/Support/DXILABI.h2
-rw-r--r--llvm/include/llvm/Support/GenericDomTree.h33
-rw-r--r--llvm/include/llvm/Support/GenericDomTreeConstruction.h35
-rw-r--r--llvm/include/llvm/Support/MathExtras.h8
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h2
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h2
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h2
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h2
-rw-r--r--llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h25
-rw-r--r--llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h3
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp4
-rw-r--r--llvm/lib/Analysis/DXILResource.cpp76
-rw-r--r--llvm/lib/Analysis/DomTreeUpdater.cpp8
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp10
-rw-r--r--llvm/lib/Analysis/Loads.cpp21
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp178
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp22
-rw-r--r--llvm/lib/AsmParser/LLLexer.cpp2
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp6
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp18
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h3
-rw-r--r--llvm/lib/CodeGen/CalcSpillWeights.cpp11
-rw-r--r--llvm/lib/CodeGen/EarlyIfConversion.cpp46
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp3
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp8
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIParser.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineStableHash.cpp8
-rw-r--r--llvm/lib/CodeGen/RegisterBankInfo.cpp5
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp19
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp34
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp45
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp58
-rw-r--r--llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp4
-rw-r--r--llvm/lib/IR/BasicBlock.cpp6
-rw-r--r--llvm/lib/IR/Function.cpp24
-rw-r--r--llvm/lib/IR/OptBisect.cpp8
-rw-r--r--llvm/lib/IR/RuntimeLibcalls.cpp5
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp10
-rw-r--r--llvm/lib/MC/MCAssembler.cpp86
-rw-r--r--llvm/lib/MC/MCContext.cpp3
-rw-r--r--llvm/lib/MC/MCELFStreamer.cpp1
-rw-r--r--llvm/lib/MC/MCFragment.cpp21
-rw-r--r--llvm/lib/MC/MCSection.cpp4
-rw-r--r--llvm/lib/ObjCopy/ConfigManager.cpp12
-rw-r--r--llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp50
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp11
-rw-r--r--llvm/lib/SandboxIR/SandboxIR.cpp834
-rw-r--r--llvm/lib/SandboxIR/Tracker.cpp106
-rw-r--r--llvm/lib/Support/APFloat.cpp20
-rw-r--r--llvm/lib/Support/Error.cpp6
-rw-r--r--llvm/lib/Support/Windows/Process.inc3
-rw-r--r--llvm/lib/Support/Windows/Signals.inc38
-rw-r--r--llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp7
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp42
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp76
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp130
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp27
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h11
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp102
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h29
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td27
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td132
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp103
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td27
-rw-r--r--llvm/lib/Target/ARM/ARMAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMAsmPrinter.h5
-rw-r--r--llvm/lib/Target/ARM/ARMMCInstLower.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp22
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp1
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp5
-rw-r--r--llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/CSKY/CSKYAsmPrinter.h3
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp1
-rw-r--r--llvm/lib/Target/DirectX/DXIL.td662
-rw-r--r--llvm/lib/Target/DirectX/DXILConstants.h10
-rw-r--r--llvm/lib/Target/DirectX/DXILOpBuilder.cpp190
-rw-r--r--llvm/lib/Target/DirectX/DXILOpBuilder.h14
-rw-r--r--llvm/lib/Target/DirectX/DXILOpLowering.cpp17
-rw-r--r--llvm/lib/Target/DirectX/DXILPrepare.cpp2
-rw-r--r--llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp6
-rw-r--r--llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp24
-rw-r--r--llvm/lib/Target/DirectX/DXILResourceAnalysis.h16
-rw-r--r--llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp6
-rw-r--r--llvm/lib/Target/DirectX/DirectX.h2
-rw-r--r--llvm/lib/Target/DirectX/DirectXPassRegistry.def4
-rw-r--r--llvm/lib/Target/DirectX/DirectXTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h4
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp645
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp77
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.h4
-rw-r--r--llvm/lib/Target/LoongArch/LoongArchInstrInfo.td26
-rw-r--r--llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp1
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td34
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.td35
-rw-r--r--llvm/lib/Target/README.txt14
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp2
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp102
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h3
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp26
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp11
-rw-r--r--llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp19
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp23
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp56
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp354
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp16
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoD.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoF.td9
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoV.td36
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td523
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td10
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td256
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td18
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td100
-rw-r--r--llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp10
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td25
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVScheduleV.td12
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h9
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp67
-rw-r--r--llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp13
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp55
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp125
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp71
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp32
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp52
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp11
-rw-r--r--llvm/lib/Target/X86/X86SchedBroadwell.td6
-rw-r--r--llvm/lib/Target/X86/X86SchedHaswell.td6
-rw-r--r--llvm/lib/Target/X86/X86SchedSandyBridge.td6
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp7
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp4
-rw-r--r--llvm/lib/TargetParser/Host.cpp2
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp7
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp88
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp2
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp12
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp61
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp46
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp24
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp11
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp5
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp6
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp17
-rw-r--r--llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp26
-rw-r--r--llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp30
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfiler.cpp26
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp39
-rw-r--r--llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp3
-rw-r--r--llvm/lib/Transforms/Scalar/LoopSink.cpp16
-rw-r--r--llvm/lib/Transforms/Utils/BuildLibCalls.cpp15
-rw-r--r--llvm/lib/Transforms/Utils/LCSSA.cpp78
-rw-r--r--llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp26
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp14
-rw-r--r--llvm/lib/Transforms/Utils/ValueMapper.cpp5
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp159
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp199
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll152
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll103
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll139
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll263
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/copysign.ll278
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/exp.ll278
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/exp10.ll278
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/exp2.ll278
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/fabs.ll207
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/fma.ll325
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/fmul.ll51
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll167
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/fneg.ll112
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/frexp.ll246
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll139
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll246
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/log.ll278
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/log10.ll278
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/log2.ll278
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/maximum.ll175
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll175
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/minimum.ll175
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/minnum.ll175
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll108
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll278
-rw-r--r--llvm/test/Analysis/CostModel/RISCV/cast.ll63
-rw-r--r--llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll481
-rw-r--r--llvm/test/Analysis/CostModel/X86/arith-overflow.ll16
-rw-r--r--llvm/test/Analysis/CycleInfo/unreachable-predecessor.ll23
-rw-r--r--llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll12
-rw-r--r--llvm/test/Analysis/ScalarEvolution/ne-overflow.ll3
-rw-r--r--llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll18
-rw-r--r--llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll163
-rw-r--r--llvm/test/Bindings/llvm-c/echo.ll2
-rw-r--r--llvm/test/Bitcode/compatibility.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/exp10-libcall-names.ll16
-rw-r--r--llvm/test/CodeGen/AArch64/fcvt_combine.ll65
-rw-r--r--llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll513
-rw-r--r--llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll382
-rw-r--r--llvm/test/CodeGen/AArch64/peephole-sxtw.mir (renamed from llvm/lib/Target/AArch64/peephole-sxtw.mir)0
-rw-r--r--llvm/test/CodeGen/AArch64/ptrauth-fpac.ll100
-rw-r--r--llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll139
-rw-r--r--llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll205
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll29
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir456
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir14
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir30
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir192
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir42
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll29
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll104
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll165
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll1095
-rw-r--r--llvm/test/CodeGen/AMDGPU/dag-divergence.ll17
-rw-r--r--llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir1072
-rw-r--r--llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/frame-index.mir603
-rw-r--r--llvm/test/CodeGen/AMDGPU/inline-asm.ll49
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll21
-rw-r--r--llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll1836
-rw-r--r--llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll2323
-rw-r--r--llvm/test/CodeGen/AMDGPU/mmra.ll5
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-flat.ll879
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-global.ll666
-rw-r--r--llvm/test/CodeGen/AMDGPU/roundeven.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/vselect.ll345
-rw-r--r--llvm/test/CodeGen/AMDGPU/while-break.ll86
-rw-r--r--llvm/test/CodeGen/AMDGPU/wqm.ll150
-rw-r--r--llvm/test/CodeGen/ARM/vselect_imax.ll115
-rw-r--r--llvm/test/CodeGen/DirectX/UAVMetadata.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/abs.ll4
-rw-r--r--llvm/test/CodeGen/DirectX/acos.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/acos_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/asin.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/asin_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/atan.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/atan_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/cbuf.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ceil.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/ceil_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/clamp.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/comput_ids.ll4
-rw-r--r--llvm/test/CodeGen/DirectX/cos.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/cos_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/cosh.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/cosh_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/dot2_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/dot3_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/dot4_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/exp.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/exp2_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/fabs.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/fdot.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/flattened_thread_id_in_group_error.ll13
-rw-r--r--llvm/test/CodeGen/DirectX/floor.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/floor_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/fmax.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/fmin.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/frac_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/group_id_error.ll13
-rw-r--r--llvm/test/CodeGen/DirectX/idot.ll4
-rw-r--r--llvm/test/CodeGen/DirectX/isinf.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/isinf_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/log.ll4
-rw-r--r--llvm/test/CodeGen/DirectX/log10.ll4
-rw-r--r--llvm/test/CodeGen/DirectX/log2.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/log2_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/pow.ll4
-rw-r--r--llvm/test/CodeGen/DirectX/reversebits.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/round.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/round_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/rsqrt.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/rsqrt_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/sin.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/sin_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/sin_no_stage_error.ll13
-rw-r--r--llvm/test/CodeGen/DirectX/sinh.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/sinh_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/smax.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/smin.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/sqrt.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/sqrt_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/tan.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/tan_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/tanh.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/tanh_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/thread_id_error.ll13
-rw-r--r--llvm/test/CodeGen/DirectX/thread_id_in_group_error.ll13
-rw-r--r--llvm/test/CodeGen/DirectX/trunc.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/trunc_error.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/umax.ll2
-rw-r--r--llvm/test/CodeGen/DirectX/umin.ll2
-rw-r--r--llvm/test/CodeGen/LoongArch/code-models.ll36
-rw-r--r--llvm/test/CodeGen/LoongArch/expand-call.ll2
-rw-r--r--llvm/test/CodeGen/LoongArch/global-address.ll32
-rw-r--r--llvm/test/CodeGen/LoongArch/global-variable-code-model.ll8
-rw-r--r--llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll46
-rw-r--r--llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll66
-rw-r--r--llvm/test/CodeGen/LoongArch/merge-base-offset.ll1107
-rw-r--r--llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll148
-rw-r--r--llvm/test/CodeGen/LoongArch/rotl-rotr.ll18
-rw-r--r--llvm/test/CodeGen/LoongArch/tls-models.ll98
-rw-r--r--llvm/test/CodeGen/LoongArch/ucmp.ll8
-rw-r--r--llvm/test/CodeGen/NVPTX/math-intrins.ll1460
-rw-r--r--llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll4
-rw-r--r--llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll2
-rw-r--r--llvm/test/CodeGen/PowerPC/build-vector-tests.ll96
-rw-r--r--llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll1
-rw-r--r--llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll6
-rw-r--r--llvm/test/CodeGen/PowerPC/combine-fneg.ll1
-rw-r--r--llvm/test/CodeGen/PowerPC/constant-pool.ll8
-rw-r--r--llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll5
-rw-r--r--llvm/test/CodeGen/PowerPC/fma-combine.ll2
-rw-r--r--llvm/test/CodeGen/PowerPC/fp-strict-round.ll8
-rw-r--r--llvm/test/CodeGen/PowerPC/frem.ll5
-rw-r--r--llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll13
-rw-r--r--llvm/test/CodeGen/PowerPC/ldexp.ll4
-rw-r--r--llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll12
-rw-r--r--llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll6
-rw-r--r--llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll1
-rw-r--r--llvm/test/CodeGen/PowerPC/save-reg-params.ll2
-rw-r--r--llvm/test/CodeGen/PowerPC/select_const.ll2
-rw-r--r--llvm/test/CodeGen/PowerPC/subreg-coalescer.mir1
-rw-r--r--llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir23
-rw-r--r--llvm/test/CodeGen/PowerPC/toc-float.ll2
-rw-r--r--llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll3
-rw-r--r--llvm/test/CodeGen/PowerPC/vec_insert_elt.ll4
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll332
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-llrint.ll28
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-lrint.ll28
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll40
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll20
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll20
-rw-r--r--llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll16
-rw-r--r--llvm/test/CodeGen/PowerPC/vsx.ll3
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-load.mir1043
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-store.mir1043
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/load.mir1481
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/store.mir1481
-rw-r--r--llvm/test/CodeGen/RISCV/avgflooru.ll48
-rw-r--r--llvm/test/CodeGen/RISCV/condops.ll692
-rw-r--r--llvm/test/CodeGen/RISCV/double-arith.ll48
-rw-r--r--llvm/test/CodeGen/RISCV/float-arith.ll41
-rw-r--r--llvm/test/CodeGen/RISCV/half-arith.ll109
-rw-r--r--llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll50
-rw-r--r--llvm/test/CodeGen/RISCV/pr94265.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/riscv-tail-dup-size.ll79
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zba.ll55
-rw-r--r--llvm/test/CodeGen/RISCV/rvv-cfi-info.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/abs-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/commutable.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll14
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll18
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll146
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll109
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll20
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll728
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs.ll (renamed from llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll)5
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll21
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll80
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmerge.ll118
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv32.ll316
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv64.ll)266
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll14
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll743
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll)228
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll853
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.v.x.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll)318
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv32.ll300
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vmv.x.s.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv64.ll)81
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll8
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrgather-rv32.ll4299
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrgather.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vrgather-rv64.ll)2818
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv32.ll1449
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv64.ll)426
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsadd-rv32.ll2849
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsadd.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vsadd-rv64.ll)990
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsaddu-rv32.ll2849
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsaddu.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vsaddu-rv64.ll)990
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll44
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vslide1down-rv32.ll1069
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vslide1down.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vslide1down-rv64.ll)490
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll44
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vslide1up-rv64.ll1059
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vslide1up.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vslide1up-rv32.ll)526
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll2166
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vsmul.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll)758
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vssub-rv64.ll2075
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vssub.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vssub-rv32.ll)862
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vssubu-rv32.ll2123
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vssubu.ll (renamed from llvm/test/CodeGen/RISCV/rvv/vssubu-rv64.ll)814
-rw-r--r--llvm/test/CodeGen/RISCV/ucmp.ll60
-rw-r--r--llvm/test/CodeGen/SPARC/2011-01-11-CC.ll610
-rw-r--r--llvm/test/CodeGen/SPARC/64cond.ll155
-rw-r--r--llvm/test/CodeGen/SPARC/fp128-split.ll66
-rw-r--r--llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll161
-rw-r--r--llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll23
-rw-r--r--llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll6
-rw-r--r--llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll10
-rw-r--r--llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll8
-rw-r--r--llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll8
-rw-r--r--llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll14
-rw-r--r--llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll8
-rw-r--r--llvm/test/CodeGen/X86/apx/i386-ndd.ll15
-rw-r--r--llvm/test/CodeGen/X86/apx/setzucc.ll43
-rw-r--r--llvm/test/CodeGen/X86/avgflooru-i128.ll145
-rw-r--r--llvm/test/CodeGen/X86/avgflooru-scalar.ll52
-rw-r--r--llvm/test/CodeGen/X86/avx-vbroadcast.ll4
-rw-r--r--llvm/test/CodeGen/X86/avx2-vbroadcast.ll4
-rw-r--r--llvm/test/CodeGen/X86/bitcast-mmx.ll34
-rw-r--r--llvm/test/CodeGen/X86/exp10-libcall-names.ll12
-rw-r--r--llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir6
-rw-r--r--llvm/test/CodeGen/X86/fast-isel-bc.ll6
-rw-r--r--llvm/test/CodeGen/X86/fast-isel-nontemporal.ll8
-rw-r--r--llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll107
-rw-r--r--llvm/test/CodeGen/X86/fsafdo_test1.ll4
-rw-r--r--llvm/test/CodeGen/X86/fsafdo_test4.ll13
-rw-r--r--llvm/test/CodeGen/X86/is_fpclass.ll52
-rw-r--r--llvm/test/CodeGen/X86/known-bits.ll2
-rw-r--r--llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll14
-rw-r--r--llvm/test/CodeGen/X86/mmx-arg-passing.ll11
-rw-r--r--llvm/test/CodeGen/X86/mmx-arith.ll307
-rw-r--r--llvm/test/CodeGen/X86/mmx-bitcast-fold.ll6
-rw-r--r--llvm/test/CodeGen/X86/mmx-bitcast.ll50
-rw-r--r--llvm/test/CodeGen/X86/mmx-build-vector.ll134
-rw-r--r--llvm/test/CodeGen/X86/mmx-coalescing.ll28
-rw-r--r--llvm/test/CodeGen/X86/mmx-cvt.ll62
-rw-r--r--llvm/test/CodeGen/X86/mmx-fold-load.ll168
-rw-r--r--llvm/test/CodeGen/X86/mmx-fold-zero.ll52
-rw-r--r--llvm/test/CodeGen/X86/mmx-intrinsics.ll865
-rw-r--r--llvm/test/CodeGen/X86/mmx-only.ll10
-rw-r--r--llvm/test/CodeGen/X86/mxcsr-reg-usage.ll24
-rw-r--r--llvm/test/CodeGen/X86/nontemporal.ll8
-rw-r--r--llvm/test/CodeGen/X86/pr13859.ll5
-rw-r--r--llvm/test/CodeGen/X86/pr23246.ll5
-rw-r--r--llvm/test/CodeGen/X86/pr29222.ll8
-rw-r--r--llvm/test/CodeGen/X86/pr35982.ll8
-rw-r--r--llvm/test/CodeGen/X86/pr99396.ll56
-rw-r--r--llvm/test/CodeGen/X86/select-mmx.ll18
-rw-r--r--llvm/test/CodeGen/X86/stack-folding-mmx.ll780
-rw-r--r--llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll18
-rw-r--r--llvm/test/CodeGen/X86/vec-libcalls.ll1212
-rw-r--r--llvm/test/CodeGen/X86/vec_extract-mmx.ll32
-rw-r--r--llvm/test/CodeGen/X86/vec_insert-5.ll4
-rw-r--r--llvm/test/CodeGen/X86/vec_insert-7.ll8
-rw-r--r--llvm/test/CodeGen/X86/vec_insert-mmx.ll6
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-mmx.ll30
-rw-r--r--llvm/test/CodeGen/X86/x86-64-psub.ll70
-rw-r--r--llvm/test/DebugInfo/X86/loop-align-debug.ll55
-rw-r--r--llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll14
-rw-r--r--llvm/test/Instrumentation/HeapProfiler/basic-histogram.ll57
-rw-r--r--llvm/test/Instrumentation/InstrProfiling/mcdc.ll12
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll144
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll326
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_lane.ll1977
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll6
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll886
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll16
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll10
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll8
-rw-r--r--llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll10
-rw-r--r--llvm/test/LTO/X86/print-pipeline-passes.ll13
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s3597
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1.s10
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16-fake16.s2828
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s6
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8-fake16.s617
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s6
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err-fake16.s505
-rw-r--r--llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s2
-rw-r--r--llvm/test/MC/ARM/Windows/invalid-relocation.s5
-rw-r--r--llvm/test/MC/ELF/layout-interdependency2.s84
-rw-r--r--llvm/test/MC/X86/apx/ccmp-att.s83
-rw-r--r--llvm/test/MC/X86/apx/ccmp-intel.s81
-rw-r--r--llvm/test/MC/X86/apx/ctest-att.s53
-rw-r--r--llvm/test/MC/X86/apx/ctest-intel.s51
-rw-r--r--llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll2
-rw-r--r--llvm/test/Other/new-pm-thinlto-prelink-defaults.ll2
-rw-r--r--llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll2
-rw-r--r--llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll2
-rw-r--r--llvm/test/TableGen/dag-isel-regclass-emit-enum.td4
-rw-r--r--llvm/test/ThinLTO/X86/ctxprof.ll73
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll45
-rw-r--r--llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll12
-rw-r--r--llvm/test/Transforms/IRCE/wide_indvar.ll20
-rw-r--r--llvm/test/Transforms/InferFunctionAttrs/annotate.ll14
-rw-r--r--llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll16
-rw-r--r--llvm/test/Transforms/InstCombine/bitcast.ll168
-rw-r--r--llvm/test/Transforms/InstCombine/cast.ll21
-rw-r--r--llvm/test/Transforms/InstCombine/ctpop-pow2.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/ctpop.ll22
-rw-r--r--llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll4
-rw-r--r--llvm/test/Transforms/InstCombine/ispow2.ll8
-rw-r--r--llvm/test/Transforms/InstCombine/lib-call-exit.ll46
-rw-r--r--llvm/test/Transforms/InstCombine/load.ll12
-rw-r--r--llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll32
-rw-r--r--llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll17
-rw-r--r--llvm/test/Transforms/InstCombine/select-load.ll101
-rw-r--r--llvm/test/Transforms/InstCombine/strnlen-2.ll15
-rw-r--r--llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll15
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll4
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll20
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll26
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll384
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll150
-rw-r--r--llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll13
-rw-r--r--llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll46
-rw-r--r--llvm/test/Transforms/SCCP/crash.ll6
-rw-r--r--llvm/test/Transforms/SCCP/float-denormal-simplification.ll21
-rw-r--r--llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll4
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll34
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll366
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll8
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll49
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll26
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll72
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll22
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll8
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll32
-rw-r--r--llvm/test/Transforms/SLPVectorizer/revec.ll30
-rw-r--r--llvm/test/Transforms/SROA/phi-and-select.ll34
-rw-r--r--llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll66
-rw-r--r--llvm/test/Transforms/SROA/pr57796.ll8
-rw-r--r--llvm/test/Transforms/SROA/select-load.ll51
-rw-r--r--llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll52
-rw-r--r--llvm/test/Transforms/VectorCombine/X86/load-widening.ll80
-rw-r--r--llvm/test/Transforms/lower-builtin-allow-check.ll249
-rw-r--r--llvm/test/Verifier/atomics.ll10
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected14
-rw-r--r--llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s10
-rw-r--r--llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s132
-rw-r--r--llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s14
-rw-r--r--llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s10
-rw-r--r--llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s14
-rw-r--r--llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s10
-rw-r--r--llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s220
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/change-section-address.test199
-rw-r--r--llvm/test/tools/llvm-objdump/BPF/disassemble-symbolize-operands.s24
-rw-r--r--llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml32
-rw-r--r--llvm/tools/llc/NewPMDriver.cpp4
-rw-r--r--llvm/tools/llvm-ctxprof-util/CMakeLists.txt1
-rw-r--r--llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp8
-rw-r--r--llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp13
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOptions.cpp71
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOpts.td7
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp6
-rw-r--r--llvm/tools/opt/NewPMDriver.cpp4
-rw-r--r--llvm/unittests/ADT/APFloatTest.cpp81
-rw-r--r--llvm/unittests/Analysis/DXILResourceTest.cpp13
-rw-r--r--llvm/unittests/Analysis/ScalarEvolutionTest.cpp36
-rw-r--r--llvm/unittests/Analysis/TargetLibraryInfoTest.cpp9
-rw-r--r--llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp25
-rw-r--r--llvm/unittests/IR/DominatorTreeTest.cpp3
-rw-r--r--llvm/unittests/IR/FunctionTest.cpp92
-rw-r--r--llvm/unittests/IR/PatternMatch.cpp8
-rw-r--r--llvm/unittests/SandboxIR/SandboxIRTest.cpp1086
-rw-r--r--llvm/unittests/SandboxIR/TrackerTest.cpp295
-rw-r--r--llvm/unittests/Support/ErrorTest.cpp2
-rw-r--r--llvm/utils/TableGen/Common/CodeGenDAGPatterns.h20
-rw-r--r--llvm/utils/TableGen/DAGISelMatcherEmitter.cpp94
-rw-r--r--llvm/utils/TableGen/DXILEmitter.cpp481
-rw-r--r--llvm/utils/TableGen/GlobalISelEmitter.cpp2
-rw-r--r--llvm/utils/TableGen/PseudoLoweringEmitter.cpp23
-rw-r--r--llvm/utils/TableGen/VTEmitter.cpp4
-rw-r--r--llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn9
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn4
-rw-r--r--mlir/docs/DefiningDialects/Operations.md53
-rw-r--r--mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td40
-rw-r--r--mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td10
-rw-r--r--mlir/include/mlir/Dialect/EmitC/IR/EmitC.td2
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td15
-rw-r--r--mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h6
-rw-r--r--mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h33
-rw-r--r--mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td33
-rw-r--r--mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h1
-rw-r--r--mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt3
-rw-r--r--mlir/include/mlir/Dialect/OpenACC/OpenACC.h1
-rw-r--r--mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td21
-rw-r--r--mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td29
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h154
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td275
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td248
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td8
-rw-r--r--mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt2
-rw-r--r--mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h3
-rw-r--r--mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h35
-rw-r--r--mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h5
-rw-r--r--mlir/include/mlir/Dialect/SCF/Utils/Utils.h8
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h55
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td11
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td152
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/VectorOps.td13
-rw-r--r--mlir/include/mlir/IR/ODSSupport.h48
-rw-r--r--mlir/include/mlir/IR/Properties.td565
-rw-r--r--mlir/include/mlir/TableGen/Operator.h2
-rw-r--r--mlir/include/mlir/TableGen/Property.h53
-rw-r--r--mlir/lib/CAPI/Dialect/SparseTensor.cpp1
-rw-r--r--mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp20
-rw-r--r--mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp34
-rw-r--r--mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp37
-rw-r--r--mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp32
-rw-r--r--mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp7
-rw-r--r--mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp47
-rw-r--r--mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt3
-rw-r--r--mlir/lib/Dialect/Arith/IR/ArithOps.cpp2
-rw-r--r--mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp2
-rw-r--r--mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp2
-rw-r--r--mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp61
-rw-r--r--mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp131
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp182
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp2
-rw-r--r--mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp37
-rw-r--r--mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp15
-rw-r--r--mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp1
-rw-r--r--mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp865
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/RotateWhileLoop.cpp44
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp388
-rw-r--r--mlir/lib/Dialect/SCF/Utils/Utils.cpp38
-rw-r--r--mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp374
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp24
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp4
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp10
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h5
-rw-r--r--mlir/lib/Dialect/Vector/IR/VectorOps.cpp86
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp6
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp6
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp17
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp3
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp22
-rw-r--r--mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp3
-rw-r--r--mlir/lib/IR/ODSSupport.cpp71
-rw-r--r--mlir/lib/IR/Verifier.cpp2
-rw-r--r--mlir/lib/TableGen/Property.cpp59
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp239
-rw-r--r--mlir/lib/Transforms/OpStats.cpp1
-rw-r--r--mlir/lib/Transforms/PrintIR.cpp1
-rw-r--r--mlir/lib/Transforms/Utils/DialectConversion.cpp32
-rw-r--r--mlir/lib/Transforms/ViewOpGraph.cpp1
-rw-r--r--mlir/python/mlir/runtime/np_to_memref.py19
-rw-r--r--mlir/python/requirements.txt3
-rw-r--r--mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir31
-rw-r--r--mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir16
-rw-r--r--mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir18
-rw-r--r--mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir15
-rw-r--r--mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir2
-rw-r--r--mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir26
-rw-r--r--mlir/test/Dialect/AMDGPU/ops.mlir9
-rw-r--r--mlir/test/Dialect/Linalg/canonicalize.mlir27
-rw-r--r--mlir/test/Dialect/Linalg/tile-tensors.mlir2
-rw-r--r--mlir/test/Dialect/Linalg/tile-to-forall.mlir60
-rw-r--r--mlir/test/Dialect/Linalg/transform-op-tile.mlir31
-rw-r--r--mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir73
-rw-r--r--mlir/test/Dialect/NVGPU/invalid.mlir18
-rw-r--r--mlir/test/Dialect/OpenMP/invalid.mlir68
-rw-r--r--mlir/test/Dialect/OpenMP/ops.mlir110
-rw-r--r--mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir2
-rw-r--r--mlir/test/Dialect/SparseTensor/invalid.mlir111
-rw-r--r--mlir/test/Dialect/SparseTensor/roundtrip.mlir56
-rw-r--r--mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir8
-rw-r--r--mlir/test/IR/properties.mlir52
-rw-r--r--mlir/test/IR/traits.mlir19
-rw-r--r--mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir2
-rw-r--r--mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir10
-rw-r--r--mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir56
-rw-r--r--mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir20
-rw-r--r--mlir/test/Target/LLVMIR/openmp-llvm.mlir12
-rw-r--r--mlir/test/Transforms/test-legalize-type-conversion.mlir18
-rw-r--r--mlir/test/Transforms/test-legalizer.mlir4
-rw-r--r--mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp24
-rw-r--r--mlir/test/lib/Dialect/Test/TestFormatUtils.cpp16
-rw-r--r--mlir/test/lib/Dialect/Test/TestFormatUtils.h3
-rw-r--r--mlir/test/lib/Dialect/Test/TestOps.td84
-rw-r--r--mlir/test/lib/Dialect/Test/TestOpsSyntax.td22
-rw-r--r--mlir/test/lib/Dialect/Test/TestPatterns.cpp15
-rw-r--r--mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp6
-rw-r--r--mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir4
-rw-r--r--mlir/test/mlir-tblgen/op-format.mlir10
-rw-r--r--mlir/test/mlir-tblgen/op-format.td4
-rw-r--r--mlir/test/mlir-tblgen/op-properties.td120
-rw-r--r--mlir/test/python/execution_engine.py40
-rw-r--r--mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp200
-rw-r--r--mlir/tools/mlir-tblgen/OpFormatGen.cpp290
-rwxr-xr-xmlir/utils/spirv/gen_spirv_dialect.py24
-rw-r--r--offload/DeviceRTL/include/LibC.h1
-rw-r--r--offload/DeviceRTL/src/Debug.cpp3
-rw-r--r--offload/DeviceRTL/src/LibC.cpp44
-rw-r--r--offload/include/Shared/EnvironmentVar.h6
-rw-r--r--offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h1
-rw-r--r--offload/plugins-nextgen/amdgpu/src/rtl.cpp72
-rw-r--r--offload/plugins-nextgen/common/include/ErrorReporting.h311
-rw-r--r--offload/plugins-nextgen/common/include/PluginInterface.h85
-rw-r--r--offload/plugins-nextgen/common/src/PluginInterface.cpp65
-rw-r--r--offload/src/omptarget.cpp4
-rw-r--r--offload/test/libc/assert.c4
-rw-r--r--offload/test/offloading/bug51781.c1
-rw-r--r--offload/test/sanitizer/double_free.c68
-rw-r--r--offload/test/sanitizer/double_free_racy.c33
-rw-r--r--offload/test/sanitizer/free_host_ptr.c25
-rw-r--r--offload/test/sanitizer/free_wrong_ptr_kind.c35
-rw-r--r--offload/test/sanitizer/free_wrong_ptr_kind.cpp38
-rw-r--r--offload/test/sanitizer/kernel_crash.c47
-rw-r--r--offload/test/sanitizer/kernel_crash_async.c40
-rw-r--r--offload/test/sanitizer/kernel_crash_many.c73
-rw-r--r--offload/test/sanitizer/kernel_crash_single.c36
-rw-r--r--offload/test/sanitizer/kernel_trap.c42
-rw-r--r--offload/test/sanitizer/kernel_trap_async.c40
-rw-r--r--offload/test/sanitizer/kernel_trap_many.c36
-rw-r--r--openmp/docs/design/Runtimes.rst14
-rw-r--r--openmp/runtime/src/kmp_affinity.cpp516
-rw-r--r--openmp/runtime/src/kmp_affinity.h8
-rw-r--r--polly/include/polly/ScopBuilder.h13
-rw-r--r--polly/include/polly/ScopInfo.h4
-rw-r--r--polly/lib/Analysis/ScopBuilder.cpp275
-rw-r--r--polly/lib/Analysis/ScopDetection.cpp3
-rw-r--r--polly/lib/Analysis/ScopInfo.cpp12
-rw-r--r--polly/test/DependenceInfo/reduction_indirect_access.ll39
-rw-r--r--polly/test/ScopInfo/reduction_double.ll57
-rw-r--r--polly/test/ScopInfo/reduction_escaping_intermediate_3.ll43
-rw-r--r--polly/test/ScopInfo/reduction_if.ll52
-rw-r--r--polly/test/ScopInfo/reduction_indirect_access.ll42
-rw-r--r--polly/test/ScopInfo/reduction_indirect_access_2.ll50
-rw-r--r--polly/test/ScopInfo/reduction_long_reduction_chain.ll61
-rw-r--r--polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll58
-rw-r--r--polly/test/ScopInfo/reduction_multiple_different_operators.ll37
-rw-r--r--utils/bazel/llvm-project-overlay/clang/BUILD.bazel6
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel30
-rw-r--r--utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel7
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/config.bzl4
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel22
1975 files changed, 75478 insertions, 47113 deletions
diff --git a/.github/workflows/release-asset-audit.py b/.github/workflows/release-asset-audit.py
new file mode 100644
index 0000000..355e7fe
--- /dev/null
+++ b/.github/workflows/release-asset-audit.py
@@ -0,0 +1,51 @@
+import github
+import sys
+
+def main():
+ token = sys.argv[1]
+
+ gh = github.Github(login_or_token=token)
+ repo = gh.get_repo("llvm/llvm-project")
+
+ uploaders = set(
+ [
+ "DimitryAndric",
+ "stefanp-ibm",
+ "lei137",
+ "omjavaid",
+ "nicolerabjohn",
+ "amy-kwan",
+ "mandlebug",
+ "zmodem",
+ "androm3da",
+ "tru",
+ "rovka",
+ "rorth",
+ "quinnlp",
+ "kamaub",
+ "abrisco",
+ "jakeegan",
+ "maryammo",
+ "tstellar",
+ "github-actions[bot]",
+ ]
+ )
+
+ for release in repo.get_releases():
+ print("Release:", release.title)
+ for asset in release.get_assets():
+ created_at = asset.created_at
+ updated_at = (
+ "" if asset.created_at == asset.updated_at else asset.updated_at
+ )
+ print(
+ f"{asset.name} : {asset.uploader.login} [{created_at} {updated_at}] ( {asset.download_count} )"
+ )
+ if asset.uploader.login not in uploaders:
+ with open('comment', 'w') as file:
+ file.write(f'@{asset.uploader.login} is not a valid uploader.')
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
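
For reference, a minimal read-only sketch of the audit loop in the script above, runnable locally. It assumes PyGithub is installed, a GITHUB_TOKEN environment variable, and a trimmed, illustrative allow-list; unlike the workflow script it only reports unexpected uploaders instead of writing a comment file and exiting.

import os
import github

# Hypothetical, trimmed allow-list; the real list lives in the script above.
ALLOWED_UPLOADERS = {"tru", "tstellar", "github-actions[bot]"}

def audit(repo_name="llvm/llvm-project"):
    gh = github.Github(login_or_token=os.environ["GITHUB_TOKEN"])
    repo = gh.get_repo(repo_name)
    unexpected = []
    for release in repo.get_releases():
        for asset in release.get_assets():
            if asset.uploader.login not in ALLOWED_UPLOADERS:
                unexpected.append(
                    f"{release.title}: {asset.name} uploaded by {asset.uploader.login}"
                )
    return unexpected

if __name__ == "__main__":
    for line in audit():
        print(line)
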
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
new file mode 100644
index 0000000..018c5d5
--- /dev/null
+++ b/.github/workflows/release-asset-audit.yml
@@ -0,0 +1,54 @@
+name: Release Asset Audit
+
+on:
+ workflow_dispatch:
+ release:
+ schedule:
+ # * is a special character in YAML so you have to quote this string
+ # Run once an hour
+ - cron: '5 * * * *'
+
+ pull_request:
+ paths:
+ - ".github/workflows/release-asset-audit.py"
+ - ".github/workflows/release-asset-audit.yml"
+
+permissions:
+ contents: read # Default everything to read-only
+
+jobs:
+ audit:
+ name: "Release Asset Audit"
+ runs-on: ubuntu-22.04
+ if: github.repository == 'llvm/llvm-project'
+ steps:
+ - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 #v4.1.6
+ - name: "Run Audit Script"
+ env:
+ GITHUB_TOKEN: ${{ github.token }}
+ run: |
+ pip install --require-hashes -r ./llvm/utils/git/requirements.txt
+ python3 ./.github/workflows/release-asset-audit.py $GITHUB_TOKEN
+ - name: "File Issue"
+ if: >-
+ github.event_name != 'pull_request' &&
+ failure()
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+ with:
+ github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
+ script: |
+ var fs = require('fs');
+ var body = ''
+ if (fs.existsSync('./comment')) {
+ body = fs.readFileSync('./comment') + "\n\n";
+ }
+ body = body + `\n\nhttps://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`
+
+ const issue = await github.rest.issues.create({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ title: "Release Asset Audit Failed",
+ labels: ['infrastructure'],
+ body: body
+ });
+ console.log(issue);
diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml
new file mode 100644
index 0000000..73c9d96
--- /dev/null
+++ b/.github/workflows/release-binaries-all.yml
@@ -0,0 +1,94 @@
+name: Release Binaries All
+
+permissions:
+ contents: read # Default everything to read-only
+
+on:
+ workflow_dispatch:
+ inputs:
+ release-version:
+ description: 'Release Version'
+ required: true
+ type: string
+ upload:
+ description: 'Upload binaries to the release page'
+ required: true
+ default: false
+ type: boolean
+
+ workflow_call:
+ inputs:
+ release-version:
+ description: 'Release Version'
+ required: true
+ type: string
+ upload:
+ description: 'Upload binaries to the release page'
+ required: true
+ default: false
+ type: boolean
+
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ # When a PR is closed, we still start this workflow, but then skip
+ # all the jobs, which makes it effectively a no-op. The reason to
+ # do this is that it allows us to take advantage of concurrency groups
+      # to cancel in-progress CI jobs whenever the PR is closed.
+ - closed
+ paths:
+ - '.github/workflows/release-binaries-all.yml'
+ - '.github/workflows/release-binaries.yml'
+ - '.github/workflows/release-binaries-setup-stage/*'
+ - '.github/workflows/release-binaries-save-stage/*'
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || 'dispatch' }}
+ cancel-in-progress: True
+
+jobs:
+ setup-variables:
+ if: >-
+ (github.event_name != 'pull_request' || github.event.action != 'closed')
+ runs-on: ubuntu-22.04
+ outputs:
+ release-version: ${{ steps.vars.outputs.release-version }}
+ upload: ${{ steps.vars.outputs.upload }}
+ steps:
+ - shell: bash
+ id: vars
+ run: |
+ upload="${{ inputs.upload }}"
+ release_version="${{ inputs.release-version }}"
+ if [ "${{ github.event_name }}" = "pull_request" ]; then
+ upload="false"
+ release_version=""
+ fi
+ echo "release-version=$release_version" >> "$GITHUB_OUTPUT"
+ echo "upload=$upload" >> "$GITHUB_OUTPUT"
+
+ release-binaries-all:
+ name: Build Release Binaries
+ needs:
+ - setup-variables
+ permissions:
+ contents: write # For release uploads
+ id-token: write # For artifact attestations
+ attestations: write # For artifact attestations
+ strategy:
+ fail-fast: false
+ matrix:
+ runs-on:
+ - ubuntu-22.04
+ - windows-2022
+ - macos-13
+ - macos-14
+
+ uses: ./.github/workflows/release-binaries.yml
+ with:
+ release-version: "${{ needs.setup-variables.outputs.release-version }}"
+ upload: ${{ needs.setup-variables.outputs.upload == 'true'}}
+ runs-on: "${{ matrix.runs-on }}"
+
diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml
new file mode 100644
index 0000000..e2f3eea
--- /dev/null
+++ b/.github/workflows/release-binaries-save-stage/action.yml
@@ -0,0 +1,38 @@
+name: Save Stage
+description: >-
+ Upload the source and binary directories from a build stage so that they
+  can be re-used in the next stage. This action is used to split the release
+  binaries workflow into multiple stages to avoid the 6-hour timeout on
+  the GitHub-hosted runners.
+inputs:
+ build-prefix:
+ description: "Directory containing the build directory."
+ required: true
+ type: 'string'
+
+runs:
+ using: "composite"
+ steps:
+ # We need to create an archive of the build directory, because it has too
+ # many files to upload.
+ - name: Package Build and Source Directories
+ shell: bash
+ run: |
+ # Windows does not support symlinks, so we need to dereference them.
+ tar --exclude build/ ${{ (runner.os == 'Windows' && '-h') || '' }} -c . | zstd -T0 -c > ../llvm-project.tar.zst
+ mv ../llvm-project.tar.zst .
+ tar -C ${{ inputs.build-prefix }} -c build/ | zstd -T0 -c > build.tar.zst
+
+ - name: Upload Stage 1 Source
+ uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ with:
+ name: ${{ runner.os }}-${{ runner.arch }}-${{ github.job }}-source
+ path: llvm-project.tar.zst
+ retention-days: 2
+
+ - name: Upload Stage 1 Build Dir
+ uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ with:
+ name: ${{ runner.os}}-${{ runner.arch }}-${{ github.job }}-build
+ path: build.tar.zst
+ retention-days: 2
diff --git a/.github/workflows/release-binaries-setup-stage/action.yml b/.github/workflows/release-binaries-setup-stage/action.yml
new file mode 100644
index 0000000..f5e5db2
--- /dev/null
+++ b/.github/workflows/release-binaries-setup-stage/action.yml
@@ -0,0 +1,59 @@
+name: Setup Stage
+description: >-
+  Set up the next stage of the release binaries workflow. This configures the
+  environment for the new stage and restores the source and build directories
+  from the previous stage.
+
+inputs:
+ previous-artifact:
+ description: >-
+ A unique descriptor for the artifact from the previous stage. This will
+ be used to construct the final artifact pattern, which is:
+ $RUNNER_OS-$RUNNER_ARCH-$PREVIOUS_ARTIFACT-*
+ required: false
+ type: 'string'
+
+outputs:
+ build-prefix:
+ description: "Directory containing the build directory."
+ value: ${{ steps.build-prefix.outputs.build-prefix }}
+
+runs:
+ using: "composite"
+ steps:
+ - name: Install Ninja
+ uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+
+ - name: Setup Windows
+ if: startsWith(runner.os, 'Windows')
+ uses: llvm/actions/setup-windows@main
+ with:
+ arch: amd64
+
+ - name: Set Build Prefix
+ id: build-prefix
+ shell: bash
+ run: |
+ build_prefix=`pwd`
+ if [ "${{ runner.os }}" = "Linux" ]; then
+ sudo chown $USER:$USER /mnt/
+ build_prefix=/mnt/
+ fi
+ echo "build-prefix=$build_prefix" >> $GITHUB_OUTPUT
+
+ - name: Download Previous Stage Artifact
+ if: ${{ inputs.previous-artifact }}
+ id: download
+ uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+ with:
+ pattern: ${{ runner.os }}-${{ runner.arch }}-${{ inputs.previous-artifact }}-*
+ merge-multiple: true
+
+ - name: Unpack Artifact
+ if: ${{ steps.download.outputs.download-path }}
+ shell: bash
+ run: |
+ tar --zstd -xf llvm-project.tar.zst
+ rm llvm-project.tar.zst
+ tar --zstd -C ${{ steps.build-prefix.outputs.build-prefix}} -xf build.tar.zst
+ rm build.tar.zst
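
A rough, hypothetical sketch of how the stage artifact names used by the two composite actions above compose; runner_os, runner_arch, job, and previous_artifact stand in for the ${{ runner.os }}, ${{ runner.arch }}, ${{ github.job }}, and previous-artifact values, and the function names are illustrative.

def save_artifact_names(runner_os, runner_arch, job):
    # Names used by the save-stage uploads for the source and build archives.
    prefix = f"{runner_os}-{runner_arch}-{job}"
    return f"{prefix}-source", f"{prefix}-build"

def restore_pattern(runner_os, runner_arch, previous_artifact):
    # Pattern used by the setup-stage download to fetch both archives at once.
    return f"{runner_os}-{runner_arch}-{previous_artifact}-*"

# Example: a Linux x64 "build-stage1" job saves
#   ("Linux-X64-build-stage1-source", "Linux-X64-build-stage1-build")
# and the next stage restores them with the pattern "Linux-X64-build-stage1-*".
print(save_artifact_names("Linux", "X64", "build-stage1"))
print(restore_pattern("Linux", "X64", "build-stage1"))
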
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 7de4d00..b1b046d 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -5,28 +5,38 @@ on:
inputs:
release-version:
description: 'Release Version'
- required: true
+ required: false
type: string
upload:
description: 'Upload binaries to the release page'
required: true
default: false
type: boolean
+ runs-on:
+ description: "Runner to use for the build"
+ required: true
+ type: choice
+ options:
+ - ubuntu-22.04
+ - windows-2022
+ - macos-13
+ - macos-14
workflow_call:
inputs:
release-version:
description: 'Release Version'
- required: true
+ required: false
type: string
upload:
description: 'Upload binaries to the release page'
required: true
default: false
type: boolean
- schedule:
- # * is a special character in YAML so you have to quote this string
- - cron: '0 8 1 * *'
+ runs-on:
+ description: "Runner to use for the build"
+ required: true
+ type: string
permissions:
contents: read # Default everything to read-only
@@ -34,30 +44,39 @@ permissions:
jobs:
prepare:
name: Prepare to build binaries
- runs-on: ubuntu-22.04
+ runs-on: ${{ inputs.runs-on }}
if: github.repository == 'llvm/llvm-project'
outputs:
release-version: ${{ steps.vars.outputs.release-version }}
ref: ${{ steps.vars.outputs.ref }}
upload: ${{ steps.vars.outputs.upload }}
+ target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }}
+ build-flang: ${{ steps.vars.outputs.build-flang }}
+ enable-pgo: ${{ steps.vars.outputs.enable-pgo }}
+ release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }}
+ release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }}
steps:
- name: Checkout LLVM
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Dependencies
+ shell: bash
run: |
pip install --require-hashes -r ./llvm/utils/git/requirements.txt
- name: Check Permissions
+ if: github.event_name != 'pull_request'
env:
GITHUB_TOKEN: ${{ github.token }}
USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
+ shell: bash
run: |
./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
- name: Collect Variables
id: vars
+ shell: bash
# In order for the test-release.sh script to run correctly, the LLVM
# source needs to be at the following location relative to the build dir:
# | X.Y.Z-rcN | ./rcN/llvm-project
@@ -67,242 +86,393 @@ jobs:
# | X.Y.Z-rcN | -rc N -test-asserts
# | X.Y.Z | -final
run: |
- tag="${{ github.ref_name }}"
trimmed=$(echo ${{ inputs.release-version }} | xargs)
- [[ "$trimmed" != "" ]] && tag="llvmorg-$trimmed"
- if [ "$tag" = "main" ]; then
- # If tag is main, then we've been triggered by a scheduled so pass so
- # use the head commit as the tag.
- tag=`git rev-parse HEAD`
+ if [ -n "$trimmed" ]; then
+ release_version="$trimmed"
+ ref="llvmorg-$release_version"
+ else
+ release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-${{ github.sha }}"
+ ref=${{ github.sha }}
fi
if [ -n "${{ inputs.upload }}" ]; then
upload="${{ inputs.upload }}"
else
upload="false"
fi
- bash .github/workflows/set-release-binary-outputs.sh "$tag" "$upload"
+ echo "release-version=$release_version">> $GITHUB_OUTPUT
+ echo "ref=$ref" >> $GITHUB_OUTPUT
+ echo "upload=$upload" >> $GITHUB_OUTPUT
+
+ release_binary_basename="LLVM-$release_version-${{ runner.os }}-${{ runner.arch }}"
+ echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT
+ echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT
+
+ # Detect necessary CMake flags
+ target="${{ runner.os }}-${{ runner.arch }}"
+ echo "enable-pgo=false" >> $GITHUB_OUTPUT
+ target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF"
+ # The macOS builds try to cross compile some libraries so we need to
+ # add extra CMake args to disable them.
+ # See https://github.com/llvm/llvm-project/issues/99767
+ if [ "${{ runner.os }}" = "macOS" ]; then
+ target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
+ if [ "${{ runner.arch }}" = "ARM64" ]; then
+ arches=arm64
+ else
+ arches=x86_64
+ fi
+ target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
+ fi
+
+ # x86 macOS and x86 Windows have trouble building flang, so disable it.
+ # Windows: https://github.com/llvm/llvm-project/issues/100202
+ # macOS: 'rebase opcodes terminated early at offset 1 of 80016' when building __fortran_builtins.mod
+ build_flang="true"
+
+ if [ "$target" = "Windows-X64" ]; then
+ target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS=\"clang;lld;lldb;clang-tools-extra;bolt;polly;mlir\""
+ build_flang="false"
+ fi
+
+ if [ "${{ runner.os }}" = "Windows" ]; then
+ # The build times out on Windows, so we need to disable LTO.
+ target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
+ fi
- build-stage1-linux:
- name: "Build Stage 1 Linux"
+ echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT
+ echo "build-flang=$build_flang" >> $GITHUB_OUTPUT
+
+ build-stage1:
+ name: "Build Stage 1"
needs: prepare
- runs-on: ubuntu-22.04
if: github.repository == 'llvm/llvm-project'
+ runs-on: ${{ inputs.runs-on }}
steps:
+
+ - name: Checkout Actions
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+ sparse-checkout: |
+ .github/workflows/
+ sparse-checkout-cone-mode: false
+ # Check out outside of working directory so the source checkout doesn't
+ # remove it.
+ path: workflows
+
+ # actions/checkout does not support paths outside of the GITHUB_WORKSPACE.
+ # Also, anything that we put inside of GITHUB_WORKSPACE will be overwritten
+      # by future actions/checkout steps. Therefore, in order to check out the
+      # latest actions from main, we need to first check out the actions inside of
+      # GITHUB_WORKSPACE (see previous step), then use actions/checkout to check out
+      # the code being built, and then move the actions from main back into GITHUB_WORKSPACE,
+      # because `uses:` on composite actions only reads workflows from inside GITHUB_WORKSPACE.
+ - shell: bash
+ run: mv workflows ../workflows-main
+
- name: Checkout LLVM
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
ref: ${{ needs.prepare.outputs.ref }}
- - name: Install Ninja
- uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+ - name: Copy main workflows
+ shell: bash
+ run: |
+ mv ../workflows-main .
+
+ - name: Setup Stage
+ id: setup-stage
+ uses: ./workflows-main/.github/workflows/release-binaries-setup-stage
- name: Setup sccache
uses: hendrikmuhs/ccache-action@ca3acd2731eef11f1572ccb126356c2f9298d35e # v1.2.9
with:
- max-size: 250M
- key: sccache-${{ runner.os }}-release
+          # Default to 2G to work around https://github.com/hendrikmuhs/ccache-action/issues/174
+ max-size: 2G
+ key: sccache-${{ runner.os }}-${{ runner.arch }}-release
variant: sccache
- name: Build Stage 1 Clang
+ id: build
+ shell: bash
run: |
- sudo chown $USER:$USER /mnt/
- cmake -G Ninja -C clang/cmake/caches/Release.cmake -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -S llvm -B /mnt/build
- ninja -v -C /mnt/build
-
- # We need to create an archive of the build directory, because it has too
- # many files to upload.
- - name: Package Build and Source Directories
- run: |
- tar -c . | zstd -T0 -c > llvm-project.tar.zst
- tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
-
- - name: Upload Stage 1 Source
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
- with:
- name: stage1-source
- path: llvm-project.tar.zst
- retention-days: 2
-
- - name: Upload Stage 1 Build Dir
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+          # There were some issues on the ARM64 macOS runners when trying to build x86 objects,
+          # so we need to set some extra cmake flags to disable this.
+ cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \
+ ${{ needs.prepare.outputs.target-cmake-flags }} \
+ -C clang/cmake/caches/Release.cmake \
+ -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \
+ -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" \
+ -DCMAKE_C_COMPILER_LAUNCHER=sccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+ ninja -v -C ${{ steps.setup-stage.outputs.build-prefix }}/build
+ # There is a race condition on the MacOS builders and this command is here
+ # to help debug that when it happens.
+ ls -ltr ${{ steps.setup-stage.outputs.build-prefix }}/build
+
+ - name: Save Stage
+ uses: ./workflows-main/.github/workflows/release-binaries-save-stage
with:
- name: stage1-build
- path: build.tar.zst
- retention-days: 2
+ build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
- build-stage2-linux:
- name: "Build Stage 2 Linux"
+ build-stage2:
+ name: "Build Stage 2"
needs:
- prepare
- - build-stage1-linux
- runs-on: ubuntu-22.04
+ - build-stage1
if: github.repository == 'llvm/llvm-project'
+ runs-on: ${{ inputs.runs-on }}
steps:
- - name: Install Ninja
- uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
-
- - name: Download Stage 1 Artifacts
- uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+ - name: Checkout Actions
+ uses: actions/checkout@v4
with:
- pattern: stage1-*
- merge-multiple: true
-
- - name: Unpack Artifacts
- run: |
- tar --zstd -xf llvm-project.tar.zst
- rm llvm-project.tar.zst
- sudo chown $USER:$USER /mnt/
- tar --zstd -C /mnt -xf build.tar.zst
- rm build.tar.zst
+ ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+ sparse-checkout: |
+ .github/workflows/
+ sparse-checkout-cone-mode: false
+ path: workflows
+ - name: Setup Stage
+ id: setup-stage
+ uses: ./workflows/.github/workflows/release-binaries-setup-stage
+ with:
+ previous-artifact: build-stage1
- name: Build Stage 2
# Re-enable once PGO builds are supported.
- if: false
- run: |
- ninja -C /mnt/build stage2-instrumented
-
- # We need to create an archive of the build directory, because it has too
- # many files to upload.
- - name: Save Build and Source Directories
+ if: needs.prepare.outputs.enable-pgo == 'true'
+ shell: bash
run: |
- tar -c . | zstd -T0 -c > llvm-project.tar.zst
- tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
+ ninja -C ${{ steps.setup-stage.outputs.build-prefix}}/build stage2-instrumented
- - name: Upload Stage 2 Source
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ - name: Save Stage
+ uses: ./workflows/.github/workflows/release-binaries-save-stage
with:
- name: stage2-source
- path: ${{ github.workspace }}/llvm-project.tar.zst
- retention-days: 2
+ build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
- - name: Upload Stage 2 Build Dir
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ build-stage3-clang:
+ name: "Build Stage 3 LLVM/Clang"
+ needs:
+ - prepare
+ - build-stage2
+ if: github.repository == 'llvm/llvm-project'
+ runs-on: ${{ inputs.runs-on }}
+ steps:
+ - name: Checkout Actions
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+ sparse-checkout: |
+ .github/workflows/
+ sparse-checkout-cone-mode: false
+ path: workflows
+ - name: Setup Stage
+ id: setup-stage
+ uses: ./workflows/.github/workflows/release-binaries-setup-stage
with:
- name: stage2-build
- path: ${{ github.workspace }}/build.tar.zst
- retention-days: 2
+ previous-artifact: build-stage2
+ - name: Build LLVM/Clang
+ shell: bash
+ run: |
+ # There is a race condition on the MacOS builders and this command is here
+ # to help debug that when it happens.
+ ls -ltr ${{ steps.setup-stage.outputs.build-prefix }}/build
+ ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-clang
+ # Build some of the larger binaries here too.
+ ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ \
+ clang-scan-deps \
+ modularize clangd \
+ clangd-indexer \
+ clang-check \
+ ${{ (runner.os == 'Linux' && 'clangd-fuzzer') || '' }} \
+ clang-tidy \
+ llc \
+ lli \
+ llvm-exegesis \
+ llvm-opt-fuzzer \
+ llvm-reduce \
+ llvm-lto \
+ dsymutil
+
+ - name: Save Stage
+ uses: ./workflows/.github/workflows/release-binaries-save-stage
+ with:
+ build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
- build-stage3-linux:
- name: "Build Stage 3 Linux"
+ build-stage3-flang:
+ name: "Build Stage 3 Flang/MLIR/Bolt"
needs:
- prepare
- - build-stage2-linux
- outputs:
- filename: ${{ steps.package-info.outputs.release-filename }}
- runs-on: ubuntu-22.04-16x64
- if: github.repository == 'llvm/llvm-project'
+ - build-stage3-clang
+ runs-on: ${{ inputs.runs-on }}
steps:
- - name: Install Ninja
- uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
-
- - name: 'Download artifact'
- uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+ - name: Checkout Actions
+ uses: actions/checkout@v4
with:
- pattern: stage2-*
- merge-multiple: true
+ ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+ sparse-checkout: |
+ .github/workflows/
+ sparse-checkout-cone-mode: false
+ path: workflows
+ - name: Setup Stage
+ id: setup-stage
+ uses: ./workflows/.github/workflows/release-binaries-setup-stage
+ with:
+ previous-artifact: build-stage3-clang
- - name: Unpack Artifact
+ - name: Build Flang / MLIR / Bolt
+ shell: bash
run: |
- tar --zstd -xf llvm-project.tar.zst
- rm llvm-project.tar.zst
- sudo chown $USER:$USER /mnt/
- tar --zstd -C /mnt -xf build.tar.zst
- rm build.tar.zst
+ # Build some of the mlir tools that take a long time to link
+ if [ "${{ needs.prepare.outputs.build-flang }}" = "true" ]; then
+ ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ -j2 flang-new bbc
+ fi
+ ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/ \
+ mlir-bytecode-parser-fuzzer \
+ mlir-cpu-runner \
+ mlir-lsp-server \
+ mlir-opt \
+ mlir-query \
+ mlir-reduce \
+ mlir-text-parser-fuzzer \
+ mlir-translate \
+ mlir-transform-opt \
+ mlir-cat \
+ mlir-minimal-opt \
+ mlir-minimal-opt-canonicalize \
+ mlir-pdll-lsp-server \
+ llvm-bolt \
+ llvm-bolt-heatmap
+
+ - name: Save Stage
+ uses: ./workflows/.github/workflows/release-binaries-save-stage
+ with:
+ build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
- - name: Build Release Package
- run: |
- ninja -C /mnt/build stage2-package
+ build-stage3-all:
+ name: "Build Stage 3"
+ needs:
+ - prepare
+ - build-stage3-flang
+ runs-on: ${{ inputs.runs-on }}
+ steps:
+ - name: Checkout Actions
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+ sparse-checkout: |
+ .github/workflows/
+ sparse-checkout-cone-mode: false
+ path: workflows
+ - name: Setup Stage
+ id: setup-stage
+ uses: ./workflows/.github/workflows/release-binaries-setup-stage
+ with:
+ previous-artifact: build-stage3-flang
- - id: package-info
+ - name: Build Release Package
+ shell: bash
run: |
- filename="LLVM-${{ needs.prepare.outputs.release-version }}-Linux.tar.xz"
- echo "filename=$filename" >> $GITHUB_OUTPUT
- echo "path=/mnt/build/tools/clang/stage2-bins/$filename" >> $GITHUB_OUTPUT
+ which cmake
+ ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-package
+ # Copy Release artifact to the workspace so it is easier to upload.
+          # This is necessary because, on Windows, the build-prefix path can
+          # only be used in bash steps: it uses the /d/files/ form, while
+          # other steps expect D:\files.
+ mv ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/${{ needs.prepare.outputs.release-binary-filename }} .
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
- if: always()
with:
- name: release-binary
- path: ${{ steps.package-info.outputs.path }}
+ name: ${{ runner.os }}-${{ runner.arch }}-release-binary
+ # Due to path differences on Windows when running in bash vs running on node,
+ # we need to search for files in the current workspace.
+ path: |
+ ${{ needs.prepare.outputs.release-binary-filename }}
# Clean up some build files to reduce size of artifact.
- name: Clean Up Build Directory
+ shell: bash
run: |
- find /mnt/build -iname ${{ steps.package-info.outputs.filename }} -delete
+ find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname ${{ needs.prepare.outputs.release-binary-filename }} -delete
+ rm -Rf ${{ steps.setup-stage.outputs.build-prefix }}/build/tools/clang/stage2-bins/_CPack_Packages
- # We need to create an archive of the build directory, because it has too
- # many files to upload.
- - name: Save Build and Source Directories
- run: |
- tar -c . | zstd -T0 -c > llvm-project.tar.zst
- tar -C /mnt/ -c build/ | zstd -T0 -c > build.tar.zst
-
- - name: Upload Stage 3 Source
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
- with:
- name: stage3-source
- path: llvm-project.tar.zst
- retention-days: 2
-
- - name: Upload Stage 3 Build Dir
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ - name: Save Stage
+ uses: ./workflows/.github/workflows/release-binaries-save-stage
with:
- name: stage3-build
- path: build.tar.zst
- retention-days: 2
+ build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
- upload-release-binaries-linux:
- name: "Upload Linux Release Binaries"
+ upload-release-binaries:
+ name: "Upload Release Binaries"
needs:
- prepare
- - build-stage3-linux
- if : ${{ needs.prepare.outputs.upload == 'true' }}
+ - build-stage3-all
+ if: >-
+ always() &&
+ github.event_name != 'pull_request' &&
+ needs.prepare.outputs.upload == 'true'
runs-on: ubuntu-22.04
permissions:
contents: write # For release uploads
+ id-token: write # For artifact attestations
+ attestations: write # For artifact attestations
steps:
- name: 'Download artifact'
uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
with:
- name: release-binary
+ pattern: '*-release-binary'
+ merge-multiple: true
+
+ - name: Attest Build Provenance
+ id: provenance
+ uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0
+ with:
+ subject-path: ${{ needs.prepare.outputs.release-binary-filename }}
+
+ - name: Rename attestation file
+ run:
+ mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
+
+ - name: Upload Build Provenance
+ uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3
+ with:
+ name: ${{ runner.os }}-${{ runner.arch }}-release-binary-attestation
+ path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
- name: Upload Release
+ shell: bash
run: |
sudo apt install python3-github
./llvm-project/llvm/utils/release/github-upload-release.py \
--token ${{ github.token }} \
--release ${{ needs.prepare.outputs.release-version }} \
upload \
- --files ${{ needs.build-stage3-linux.outputs.release-filename }}
-
+ --files ${{ needs.prepare.outputs.release-binary-filename }}*
- test-stage3-linux:
- name: "Test Stage 3 Linux"
+ test-stage3:
+ name: "Test Stage 3"
needs:
- prepare
- - build-stage3-linux
- runs-on: ubuntu-22.04
- if: github.repository == 'llvm/llvm-project'
+ - build-stage3-all
+ if: >-
+ github.repository == 'llvm/llvm-project'
+ runs-on: ${{ inputs.runs-on }}
steps:
- - name: Install Ninja
- uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
-
- - name: 'Download artifact'
- uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
+ - name: Checkout Actions
+ uses: actions/checkout@v4
with:
- pattern: stage3-*
- merge-multiple: true
-
- - name: Unpack Artifact
- run: |
- tar --zstd -xf llvm-project.tar.zst
- rm llvm-project.tar.zst
- sudo chown $USER:$USER /mnt/
- tar --zstd -C /mnt -xf build.tar.zst
- rm build.tar.zst
+ ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
+ sparse-checkout: |
+ .github/workflows/
+ sparse-checkout-cone-mode: false
+ path: workflows
+ - name: Setup Stage
+ id: setup-stage
+ uses: ./workflows/.github/workflows/release-binaries-setup-stage
+ with:
+ previous-artifact: build-stage3-all
- name: Run Tests
+ shell: bash
run: |
- ninja -C /mnt/build stage2-check-all
+ ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-check-all
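
A hypothetical sketch of the "Collect Variables" logic in the workflow above: pick the release version (explicit input, PR number, or plain CI) and derive the binary name LLVM-<version>-<os>-<arch>.tar.xz. All parameter and function names are illustrative.

def release_binary_filename(release_version, event_name, pr_number, sha,
                            runner_os, runner_arch):
    # Prefer an explicit release-version input; otherwise synthesize one from
    # the PR number (pull requests) or a plain CI marker, plus the commit SHA.
    trimmed = (release_version or "").strip()
    if trimmed:
        version = trimmed
    elif event_name == "pull_request":
        version = f"PR{pr_number}-{sha}"
    else:
        version = f"CI-{sha}"
    return f"LLVM-{version}-{runner_os}-{runner_arch}.tar.xz"

# e.g. release_binary_filename("", "pull_request", 101234, "abc123", "macOS", "ARM64")
#   -> "LLVM-PR101234-abc123-macOS-ARM64.tar.xz"
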
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 9c5b1a9..b0c0b65 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -47,7 +47,7 @@ jobs:
steps:
- id: inputs
run: |
- ref=${{ inputs.release-version || github.sha }}
+ ref=${{ (inputs.release-version && format('llvmorg-{0}', inputs.release-version)) || github.sha }}
if [ -n "${{ inputs.release-version }}" ]; then
export_args="-release ${{ inputs.release-version }} -final"
else
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index 2ed56da..7dd4c30 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -81,10 +81,20 @@ jobs:
needs:
- validate-tag
- release-create
+ strategy:
+ fail-fast: false
+ matrix:
+ runs-on:
+ - ubuntu-22.04
+ - windows-2022
+ - macos-13
+ - macos-14
+
uses: ./.github/workflows/release-binaries.yml
with:
release-version: ${{ needs.validate-tag.outputs.release-version }}
upload: true
+ runs-on: ${{ matrix.runs-on }}
release-sources:
name: Package Release Sources
diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h
index 0b840c1..e5b057e 100644
--- a/bolt/include/bolt/Core/DIEBuilder.h
+++ b/bolt/include/bolt/Core/DIEBuilder.h
@@ -127,6 +127,9 @@ private:
DWARFContext *DwarfContext{nullptr};
DWARFUnit *SkeletonCU{nullptr};
uint64_t UnitSize{0};
+  /// Separate UnitSize counter used when updating DebugNames,
+  /// so there is no dependency between the functions.
+ uint64_t DebugNamesUnitSize{0};
llvm::DenseSet<uint64_t> AllProcessed;
DWARF5AcceleratorTable &DebugNamesTable;
// Unordered map to handle name collision if output DWO directory is
@@ -203,13 +206,16 @@ private:
/// Update references once the layout is finalized.
void updateReferences();
- /// Update the Offset and Size of DIE, populate DebugNames table.
+ /// Update the Offset and Size of DIE.
/// Along with current CU, and DIE being processed and the new DIE offset to
/// be updated, it takes in Parents vector that can be empty if this DIE has
/// no parents.
- uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die,
- std::optional<BOLTDWARF5AccelTableData *> Parent,
- uint32_t NumberParentsInChain, uint32_t &CurOffset);
+ uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die, uint32_t &CurOffset);
+
+ /// Populates DebugNames table.
+ void populateDebugNamesTable(DWARFUnit &CU, const DIE &Die,
+ std::optional<BOLTDWARF5AccelTableData *> Parent,
+ uint32_t NumberParentsInChain);
void registerUnit(DWARFUnit &DU, bool NeedSort);
@@ -338,6 +344,9 @@ public:
/// Finish current DIE construction.
void finish();
+ /// Update debug names table.
+ void updateDebugNamesTable();
+
// Interface to edit DIE
template <class T> T *allocateDIEValue() {
return new (getState().DIEAlloc) T;
diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h
index 5935ffa..6ea3b1a 100644
--- a/bolt/include/bolt/Core/DebugData.h
+++ b/bolt/include/bolt/Core/DebugData.h
@@ -475,7 +475,8 @@ public:
}
/// Update Str offset in .debug_str in .debug_str_offsets.
- void updateAddressMap(uint32_t Index, uint32_t Address);
+ void updateAddressMap(uint32_t Index, uint32_t Address,
+ const DWARFUnit &Unit);
/// Get offset for given index in original .debug_str_offsets section.
uint64_t getOffset(uint32_t Index) const { return StrOffsets[Index]; }
@@ -507,6 +508,8 @@ private:
std::unique_ptr<DebugStrOffsetsBufferVector> StrOffsetsBuffer;
std::unique_ptr<raw_svector_ostream> StrOffsetsStream;
std::map<uint32_t, uint32_t> IndexToAddressMap;
+ [[maybe_unused]]
+ DenseSet<uint64_t> DebugStrOffsetFinalized;
SmallVector<uint32_t, 5> StrOffsets;
std::unordered_map<uint64_t, uint64_t> ProcessedBaseOffsets;
bool StrOffsetSectionWasModified = false;
diff --git a/bolt/include/bolt/Core/GDBIndex.h b/bolt/include/bolt/Core/GDBIndex.h
index 6604c2a..0ebcf4e 100644
--- a/bolt/include/bolt/Core/GDBIndex.h
+++ b/bolt/include/bolt/Core/GDBIndex.h
@@ -53,6 +53,14 @@ public:
const GDBIndexTUEntryType &getGDBIndexTUEntryVector() const {
return GDBIndexTUEntryVector;
}
+
+ /// Sorts entries in GDBIndexTUEntryVector according to the TypeHash.
+ void sortGDBIndexTUEntryVector() {
+ llvm::stable_sort(GDBIndexTUEntryVector, [](const GDBIndexTUEntry &LHS,
+ const GDBIndexTUEntry &RHS) {
+ return LHS.TypeHash > RHS.TypeHash;
+ });
+ }
};
} // namespace bolt
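
A Python sketch of the ordering sortGDBIndexTUEntryVector() establishes above: a stable sort of the TU entries by descending TypeHash. The dataclass is a simplified, hypothetical stand-in for GDBIndexTUEntry.

from dataclasses import dataclass

@dataclass
class GDBIndexTUEntry:
    type_hash: int
    type_die_relative_offset: int

def sort_gdb_index_tu_entries(entries):
    # list.sort() is stable, matching llvm::stable_sort; reverse=True reproduces
    # the "LHS.TypeHash > RHS.TypeHash" comparator (descending TypeHash).
    entries.sort(key=lambda e: e.type_hash, reverse=True)
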
diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h
index b798c5b..deaf179 100644
--- a/bolt/include/bolt/Rewrite/DWARFRewriter.h
+++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h
@@ -15,7 +15,6 @@
#include "bolt/Core/GDBIndex.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DIE.h"
-#include "llvm/DWP/DWP.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/ToolOutputFile.h"
#include <cstdint>
@@ -41,13 +40,6 @@ public:
uint64_t TypeHash;
uint64_t TypeDIERelativeOffset;
};
- /// Contains information for CU or TU so we can output correct {cu, tu}-index.
- struct UnitMeta {
- uint64_t Offset;
- uint64_t Length;
- uint64_t TUHash;
- };
- using UnitMetaVectorType = std::vector<UnitMeta>;
private:
BinaryContext &BC;
@@ -194,35 +186,6 @@ public:
const std::string &, DebugLocWriter &,
DebugStrOffsetsWriter &, DebugStrWriter &);
using KnownSectionsEntry = std::pair<MCSection *, DWARFSectionKind>;
- struct DWPState {
- std::unique_ptr<ToolOutputFile> Out;
- std::unique_ptr<BinaryContext> TmpBC;
- std::unique_ptr<MCStreamer> Streamer;
- std::unique_ptr<DWPStringPool> Strings;
- /// Used to store String sections for .dwo files if they are being modified.
- std::vector<std::unique_ptr<DebugBufferVector>> StrSections;
- const MCObjectFileInfo *MCOFI = nullptr;
- const DWARFUnitIndex *CUIndex = nullptr;
- std::deque<SmallString<32>> UncompressedSections;
- MapVector<uint64_t, UnitIndexEntry> IndexEntries;
- MapVector<uint64_t, UnitIndexEntry> TypeIndexEntries;
- StringMap<KnownSectionsEntry> KnownSections;
- uint32_t ContributionOffsets[8] = {};
- uint32_t IndexVersion = 2;
- uint64_t DebugInfoSize = 0;
- uint16_t Version = 0;
- bool IsDWP = false;
- };
- /// Init .dwp file
- void initDWPState(DWPState &);
-
- /// Write out .dwp File
- void finalizeDWP(DWPState &);
-
- /// add content of dwo to .dwp file.
- void updateDWP(DWARFUnit &, const OverriddenSectionsMap &, const UnitMeta &,
- UnitMetaVectorType &, DWPState &, DebugLocWriter &,
- DebugStrOffsetsWriter &, DebugStrWriter &);
};
} // namespace bolt
diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp
index b0f550f..69cfd58 100644
--- a/bolt/lib/Core/DIEBuilder.cpp
+++ b/bolt/lib/Core/DIEBuilder.cpp
@@ -78,7 +78,7 @@ static void addStringHelper(DebugStrOffsetsWriter &StrOffstsWriter,
uint32_t NewOffset = StrWriter.addString(Str);
if (Unit.getVersion() >= 5) {
StrOffstsWriter.updateAddressMap(DIEAttrInfo.getDIEInteger().getValue(),
- NewOffset);
+ NewOffset, Unit);
return;
}
DIEBldr.replaceValue(&Die, DIEAttrInfo.getAttribute(), DIEAttrInfo.getForm(),
@@ -461,17 +461,11 @@ getUnitForOffset(DIEBuilder &Builder, DWARFContext &DWCtx,
return nullptr;
}
-uint32_t
-DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
- std::optional<BOLTDWARF5AccelTableData *> Parent,
- uint32_t NumberParentsInChain, uint32_t &CurOffset) {
+uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
+ uint32_t &CurOffset) {
getState().DWARFDieAddressesParsed.erase(Die.getOffset());
uint32_t CurSize = 0;
Die.setOffset(CurOffset);
- std::optional<BOLTDWARF5AccelTableData *> NameEntry =
- DebugNamesTable.addAccelTableEntry(
- CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt,
- NumberParentsInChain, Parent);
// It is possible that an indexed debugging information entry has a parent
// that is not indexed (for example, if its parent does not have a name
// attribute). In such a case, a parent attribute may point to a nameless
@@ -485,18 +479,13 @@ DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
// If Parent is nullopt and NumberParentsInChain is not zero, then forward
// declaration was encountered in this DF traversal. Propagating nullopt for
// Parent to children.
- if (!Parent && NumberParentsInChain)
- NameEntry = std::nullopt;
- if (NameEntry)
- ++NumberParentsInChain;
for (DIEValue &Val : Die.values())
CurSize += Val.sizeOf(CU.getFormParams());
CurSize += getULEB128Size(Die.getAbbrevNumber());
CurOffset += CurSize;
for (DIE &Child : Die.children()) {
- uint32_t ChildSize =
- finalizeDIEs(CU, Child, NameEntry, NumberParentsInChain, CurOffset);
+ uint32_t ChildSize = finalizeDIEs(CU, Child, CurOffset);
CurSize += ChildSize;
}
// for children end mark.
@@ -514,10 +503,9 @@ void DIEBuilder::finish() {
DIE *UnitDIE = getUnitDIEbyUnit(CU);
uint32_t HeaderSize = CU.getHeaderSize();
uint32_t CurOffset = HeaderSize;
- DebugNamesTable.setCurrentUnit(CU, UnitStartOffset);
std::vector<std::optional<BOLTDWARF5AccelTableData *>> Parents;
Parents.push_back(std::nullopt);
- finalizeDIEs(CU, *UnitDIE, std::nullopt, 0, CurOffset);
+ finalizeDIEs(CU, *UnitDIE, CurOffset);
DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU);
CurUnitInfo.UnitOffset = UnitStartOffset;
@@ -548,6 +536,48 @@ void DIEBuilder::finish() {
dbgs() << Twine::utohexstr(Address) << "\n";
}
}
+}
+
+void DIEBuilder::populateDebugNamesTable(
+ DWARFUnit &CU, const DIE &Die,
+ std::optional<BOLTDWARF5AccelTableData *> Parent,
+ uint32_t NumberParentsInChain) {
+ std::optional<BOLTDWARF5AccelTableData *> NameEntry =
+ DebugNamesTable.addAccelTableEntry(
+ CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt,
+ NumberParentsInChain, Parent);
+ if (!Parent && NumberParentsInChain)
+ NameEntry = std::nullopt;
+ if (NameEntry)
+ ++NumberParentsInChain;
+
+ for (const DIE &Child : Die.children())
+ populateDebugNamesTable(CU, Child, NameEntry, NumberParentsInChain);
+}
+
+void DIEBuilder::updateDebugNamesTable() {
+ auto finalizeDebugNamesTableForCU = [&](DWARFUnit &CU,
+ uint64_t &UnitStartOffset) -> void {
+ DIE *UnitDIE = getUnitDIEbyUnit(CU);
+ DebugNamesTable.setCurrentUnit(CU, UnitStartOffset);
+ populateDebugNamesTable(CU, *UnitDIE, std::nullopt, 0);
+
+ DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU);
+ UnitStartOffset += CurUnitInfo.UnitLength;
+ };
+
+ uint64_t TypeUnitStartOffset = 0;
+ for (DWARFUnit *CU : getState().DUList) {
+ if (!(CU->getVersion() < 5 && CU->isTypeUnit()))
+ break;
+ finalizeDebugNamesTableForCU(*CU, TypeUnitStartOffset);
+ }
+
+ for (DWARFUnit *CU : getState().DUList) {
+ if (CU->getVersion() < 5 && CU->isTypeUnit())
+ continue;
+ finalizeDebugNamesTableForCU(*CU, DebugNamesUnitSize);
+ }
updateReferences();
}
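This hunk splits what used to be a single traversal into two passes: finalizeDIEs() now only assigns offsets and sizes, and the new populateDebugNamesTable()/updateDebugNamesTable() walk the already-laid-out tree afterwards to build the accelerator-table entries. A rough sketch of that two-pass shape over a generic tree; the types and names below are placeholders, not the DIEBuilder API:

#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

struct Node {
  uint32_t Size = 0;
  uint32_t Offset = 0;
  bool HasName = false;
  std::vector<Node> Children;
};

// Pass 1: lay out offsets only; no name-table work happens here.
uint32_t assignOffsets(Node &N, uint32_t &CurOffset) {
  N.Offset = CurOffset;
  uint32_t Total = N.Size;
  CurOffset += N.Size;
  for (Node &Child : N.Children)
    Total += assignOffsets(Child, CurOffset);
  return Total;
}

// Pass 2: visit the finished tree and record named entries, threading the
// nearest named ancestor down the chain the way populateDebugNamesTable()
// threads its Parent argument.
void collectNames(const Node &N, std::optional<uint32_t> Parent,
                  std::vector<std::pair<uint32_t, std::optional<uint32_t>>> &Out) {
  std::optional<uint32_t> Self = Parent;
  if (N.HasName) {
    Out.emplace_back(N.Offset, Parent);
    Self = N.Offset;
  }
  for (const Node &Child : N.Children)
    collectNames(Child, Self, Out);
}

Separating the passes is what lets DWARFRewriter defer name-table population for split units until every DIEBuilder in a batch has finished emitting (see the DWODIEBuildersByCU loop further down in this diff).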
diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp
index 002f58c..bd8aa80 100644
--- a/bolt/lib/Core/DebugData.cpp
+++ b/bolt/lib/Core/DebugData.cpp
@@ -851,7 +851,11 @@ void DebugStrOffsetsWriter::initialize(DWARFUnit &Unit) {
StrOffsetsSection.Data.data() + Contr->Base + Offset));
}
-void DebugStrOffsetsWriter::updateAddressMap(uint32_t Index, uint32_t Address) {
+void DebugStrOffsetsWriter::updateAddressMap(uint32_t Index, uint32_t Address,
+ const DWARFUnit &Unit) {
+ assert(DebugStrOffsetFinalized.count(Unit.getOffset()) == 0 &&
+ "Cannot update address map since debug_str_offsets was already "
+ "finalized for this CU.");
IndexToAddressMap[Index] = Address;
StrOffsetSectionWasModified = true;
}
@@ -906,6 +910,8 @@ void DebugStrOffsetsWriter::finalizeSection(DWARFUnit &Unit,
}
StrOffsetSectionWasModified = false;
+ assert(DebugStrOffsetFinalized.insert(Unit.getOffset()).second &&
+ "debug_str_offsets was already finalized for this CU.");
clear();
}
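The [[maybe_unused]] DebugStrOffsetFinalized set added to DebugStrOffsetsWriter turns "finalize each unit's contribution at most once, and never update it afterwards" into an assertable invariant; because the insertion happens inside assert(), the set is only populated in assert-enabled builds, which is why it is marked [[maybe_unused]]. A small stand-alone sketch of the same guard, using std::unordered_set in place of DenseSet (names here are illustrative, not the BOLT writer):

#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <unordered_set>

class StrOffsetsWriterSketch {
  std::unordered_map<uint32_t, uint32_t> IndexToAddressMap;
  std::unordered_set<uint64_t> FinalizedUnits;

public:
  void updateAddressMap(uint32_t Index, uint32_t Address, uint64_t UnitOffset) {
    // An update after the unit's contribution has been emitted would be
    // silently lost, so it is treated as a programming error.
    assert(FinalizedUnits.count(UnitOffset) == 0 &&
           "str_offsets already finalized for this unit");
    IndexToAddressMap[Index] = Address;
  }

  void finalizeUnit(uint64_t UnitOffset) {
    // insert().second is false on a repeated finalize.
    bool Inserted = FinalizedUnits.insert(UnitOffset).second;
    assert(Inserted && "unit finalized twice");
    (void)Inserted;
    IndexToAddressMap.clear();
  }
};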
diff --git a/bolt/lib/Core/GDBIndex.cpp b/bolt/lib/Core/GDBIndex.cpp
index 9e6d241..c7fb488 100644
--- a/bolt/lib/Core/GDBIndex.cpp
+++ b/bolt/lib/Core/GDBIndex.cpp
@@ -23,7 +23,6 @@ void GDBIndex::updateGdbIndexSection(
DebugARangesSectionWriter &ARangesSectionWriter) {
if (!BC.getGdbIndexSection())
return;
-
// See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html
// for .gdb_index section format.
@@ -141,7 +140,7 @@ void GDBIndex::updateGdbIndexSection(
write64le(Buffer + 8, CUInfo.second.Length + 4);
Buffer += 16;
}
-
+ sortGDBIndexTUEntryVector();
// Rewrite TU CU List, since abbrevs can be different.
// Entry example:
// 0: offset = 0x00000000, type_offset = 0x0000001e, type_signature =
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 34993af..5d11492 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -1,7 +1,6 @@
set(LLVM_LINK_COMPONENTS
Core
DebugInfoDWARF
- DWP
JITLink
MC
Object
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 674b5f1..98f81f4 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -32,6 +32,7 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -56,6 +57,8 @@
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"
+static mc::RegisterMCTargetOptionsFlags MOF;
+
static void printDie(const DWARFDie &DIE) {
DIDumpOptions DumpOpts;
DumpOpts.ShowForm = true;
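The file-static MOF object uses the common LLVM flag-registration idiom: constructing it during static initialization registers the MC target option cl::opt flags as a side effect, presumably so that MC target options derived from command-line flags are available when the rewriter creates its streamers. A generic sketch of that constructor-side-effect registration pattern, independent of the LLVM classes:

#include <string>
#include <vector>

// Registry filled before main() runs via static initialization.
static std::vector<std::string> &optionRegistry() {
  static std::vector<std::string> Registry;
  return Registry;
}

// Constructing a static instance registers a group of options as a side
// effect; mc::RegisterMCTargetOptionsFlags follows the same shape for the
// MC target flags.
struct RegisterFlags {
  RegisterFlags() { optionRegistry().push_back("example-mc-flag"); }
};

static RegisterFlags RegisterFlagsOnce;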
@@ -328,14 +331,8 @@ static cl::opt<bool> KeepARanges(
static cl::opt<std::string> DwarfOutputPath(
"dwarf-output-path",
- cl::desc("Path to where .dwo files or dwp file will be written out to."),
- cl::init(""), cl::cat(BoltCategory));
-
-static cl::opt<bool>
- WriteDWP("write-dwp",
- cl::desc("output a single dwarf package file (dwp) instead of "
- "multiple non-relocatable dwarf object files (dwo)."),
- cl::init(false), cl::cat(BoltCategory));
+ cl::desc("Path to where .dwo files will be written out to."), cl::init(""),
+ cl::cat(BoltCategory));
static cl::opt<bool> CreateDebugNames(
"create-debug-names-section",
@@ -467,23 +464,19 @@ createDIEStreamer(const Triple &TheTriple, raw_pwrite_stream &OutFile,
return Streamer;
}
-static DWARFRewriter::UnitMeta
-emitUnit(DIEBuilder &DIEBldr, DIEStreamer &Streamer, DWARFUnit &Unit) {
+static void emitUnit(DIEBuilder &DIEBldr, DIEStreamer &Streamer,
+ DWARFUnit &Unit) {
DIE *UnitDIE = DIEBldr.getUnitDIEbyUnit(Unit);
- const DIEBuilder::DWARFUnitInfo &U = DIEBldr.getUnitInfoByDwarfUnit(Unit);
Streamer.emitUnit(Unit, *UnitDIE);
- uint64_t TypeHash = 0;
- if (DWARFTypeUnit *DTU = dyn_cast_or_null<DWARFTypeUnit>(&Unit))
- TypeHash = DTU->getTypeHash();
- return {U.UnitOffset, U.UnitLength, TypeHash};
}
-static void
-emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
- DWARFRewriter &Rewriter, DWARFUnit &SplitCU, DWARFUnit &CU,
- DWARFRewriter::DWPState &State, DebugLocWriter &LocWriter,
- DebugStrOffsetsWriter &StrOffstsWriter,
- DebugStrWriter &StrWriter, GDBIndex &GDBIndexSection) {
+static void emitDWOBuilder(const std::string &DWOName,
+ DIEBuilder &DWODIEBuilder, DWARFRewriter &Rewriter,
+ DWARFUnit &SplitCU, DWARFUnit &CU,
+ DebugLocWriter &LocWriter,
+ DebugStrOffsetsWriter &StrOffstsWriter,
+ DebugStrWriter &StrWriter,
+ GDBIndex &GDBIndexSection) {
// Populate debug_info and debug_abbrev for current dwo into StringRef.
DWODIEBuilder.generateAbbrevs();
DWODIEBuilder.finish();
@@ -496,28 +489,22 @@ emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
std::unique_ptr<DIEStreamer> Streamer =
createDIEStreamer(*TheTriple, *ObjOS, "DwoStreamerInitAug2",
DWODIEBuilder, GDBIndexSection);
- DWARFRewriter::UnitMetaVectorType TUMetaVector;
- DWARFRewriter::UnitMeta CUMI = {0, 0, 0};
if (SplitCU.getContext().getMaxDWOVersion() >= 5) {
for (std::unique_ptr<llvm::DWARFUnit> &CU :
SplitCU.getContext().dwo_info_section_units()) {
if (!CU->isTypeUnit())
continue;
- DWARFRewriter::UnitMeta MI =
- emitUnit(DWODIEBuilder, *Streamer, *CU.get());
- TUMetaVector.emplace_back(MI);
+ emitUnit(DWODIEBuilder, *Streamer, *CU.get());
}
- CUMI = emitUnit(DWODIEBuilder, *Streamer, SplitCU);
+ emitUnit(DWODIEBuilder, *Streamer, SplitCU);
} else {
for (std::unique_ptr<llvm::DWARFUnit> &CU :
SplitCU.getContext().dwo_compile_units())
emitUnit(DWODIEBuilder, *Streamer, *CU.get());
// emit debug_types sections for dwarf4
- for (DWARFUnit *CU : DWODIEBuilder.getDWARF4TUVector()) {
- DWARFRewriter::UnitMeta MI = emitUnit(DWODIEBuilder, *Streamer, *CU);
- TUMetaVector.emplace_back(MI);
- }
+ for (DWARFUnit *CU : DWODIEBuilder.getDWARF4TUVector())
+ emitUnit(DWODIEBuilder, *Streamer, *CU);
}
Streamer->emitAbbrevs(DWODIEBuilder.getAbbrevs(),
@@ -544,12 +531,8 @@ emitDWOBuilder(const std::string &DWOName, DIEBuilder &DWODIEBuilder,
continue;
OverriddenSections[Kind] = Contents;
}
- if (opts::WriteDWP)
- Rewriter.updateDWP(CU, OverriddenSections, CUMI, TUMetaVector, State,
- LocWriter, StrOffstsWriter, StrWriter);
- else
- Rewriter.writeDWOFiles(CU, OverriddenSections, DWOName, LocWriter,
- StrOffstsWriter, StrWriter);
+ Rewriter.writeDWOFiles(CU, OverriddenSections, DWOName, LocWriter,
+ StrOffstsWriter, StrWriter);
}
using DWARFUnitVec = std::vector<DWARFUnit *>;
@@ -662,17 +645,13 @@ void DWARFRewriter::updateDebugInfo() {
DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC,
*StrWriter);
GDBIndex GDBIndexSection(BC);
- DWPState State;
- if (opts::WriteDWP)
- initDWPState(State);
auto processSplitCU = [&](DWARFUnit &Unit, DWARFUnit &SplitCU,
DIEBuilder &DIEBlder,
DebugRangesSectionWriter &TempRangesSectionWriter,
DebugAddrWriter &AddressWriter,
const std::string &DWOName,
- const std::optional<std::string> &DwarfOutputPath) {
- DIEBuilder DWODIEBuilder(BC, &(SplitCU).getContext(), DebugNamesTable,
- &Unit);
+ const std::optional<std::string> &DwarfOutputPath,
+ DIEBuilder &DWODIEBuilder) {
DWODIEBuilder.buildDWOUnit(SplitCU);
DebugStrOffsetsWriter DWOStrOffstsWriter(BC);
DebugStrWriter DWOStrWriter((SplitCU).getContext(), true);
@@ -688,7 +667,7 @@ void DWARFRewriter::updateDebugInfo() {
if (Unit.getVersion() >= 5)
TempRangesSectionWriter.finalizeSection();
- emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit, State,
+ emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit,
DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter,
GDBIndexSection);
};
@@ -711,7 +690,8 @@ void DWARFRewriter::updateDebugInfo() {
RangesBase = RangesSectionWriter.getSectionOffset() +
getDWARF5RngListLocListHeaderSize();
RangesSectionWriter.initSection(Unit);
- StrOffstsWriter->finalizeSection(Unit, DIEBlder);
+ if (!SplitCU)
+ StrOffstsWriter->finalizeSection(Unit, DIEBlder);
} else if (SplitCU) {
RangesBase = LegacyRangesSectionWriter.get()->getSectionOffset();
}
@@ -738,6 +718,7 @@ void DWARFRewriter::updateDebugInfo() {
CUPartitionVector PartVec = partitionCUs(*BC.DwCtx);
for (std::vector<DWARFUnit *> &Vec : PartVec) {
DIEBlder.buildCompileUnits(Vec);
+ llvm::SmallVector<std::unique_ptr<DIEBuilder>, 72> DWODIEBuildersByCU;
for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) {
createRangeLocListAddressWriters(*CU);
std::optional<DWARFUnit *> SplitCU;
@@ -757,9 +738,17 @@ void DWARFRewriter::updateDebugInfo() {
: std::optional<std::string>(opts::DwarfOutputPath.c_str());
std::string DWOName = DIEBlder.updateDWONameCompDir(
*StrOffstsWriter, *StrWriter, *CU, DwarfOutputPath, std::nullopt);
+ auto DWODIEBuilderPtr = std::make_unique<DIEBuilder>(
+ BC, &(**SplitCU).getContext(), DebugNamesTable, CU);
+ DIEBuilder &DWODIEBuilder =
+ *DWODIEBuildersByCU.emplace_back(std::move(DWODIEBuilderPtr)).get();
+ if (CU->getVersion() >= 5)
+ StrOffstsWriter->finalizeSection(*CU, DIEBlder);
processSplitCU(*CU, **SplitCU, DIEBlder, *TempRangesSectionWriter,
- AddressWriter, DWOName, DwarfOutputPath);
+ AddressWriter, DWOName, DwarfOutputPath, DWODIEBuilder);
}
+ for (std::unique_ptr<DIEBuilder> &DWODIEBuilderPtr : DWODIEBuildersByCU)
+ DWODIEBuilderPtr->updateDebugNamesTable();
for (DWARFUnit *CU : DIEBlder.getProcessedCUs())
processMainBinaryCU(*CU, DIEBlder);
finalizeCompileUnits(DIEBlder, *Streamer, OffsetMap,
@@ -768,9 +757,6 @@ void DWARFRewriter::updateDebugInfo() {
DebugNamesTable.emitAccelTable();
- if (opts::WriteDWP)
- finalizeDWP(State);
-
finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS, OffsetMap,
*FinalAddrWriter);
GDBIndexSection.updateGdbIndexSection(OffsetMap, CUIndex,
@@ -1462,6 +1448,7 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder,
// generate and populate abbrevs here
DIEBlder.generateAbbrevs();
DIEBlder.finish();
+ DIEBlder.updateDebugNamesTable();
SmallVector<char, 20> OutBuffer;
std::shared_ptr<raw_svector_ostream> ObjOS =
std::make_shared<raw_svector_ostream>(OutBuffer);
@@ -1666,6 +1653,7 @@ void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder,
}
DIEBlder.generateAbbrevs();
DIEBlder.finish();
+ DIEBlder.updateDebugNamesTable();
// generate debug_info and CUMap
for (DWARFUnit *CU : CUs) {
emitUnit(DIEBlder, Streamer, *CU);
@@ -1816,220 +1804,6 @@ std::optional<StringRef> updateDebugData(
} // namespace
-void DWARFRewriter::initDWPState(DWPState &State) {
- SmallString<0> OutputNameStr;
- StringRef OutputName;
- if (opts::DwarfOutputPath.empty()) {
- OutputName =
- Twine(opts::OutputFilename).concat(".dwp").toStringRef(OutputNameStr);
- } else {
- StringRef ExeFileName = llvm::sys::path::filename(opts::OutputFilename);
- OutputName = Twine(opts::DwarfOutputPath)
- .concat("/")
- .concat(ExeFileName)
- .concat(".dwp")
- .toStringRef(OutputNameStr);
- errs() << "BOLT-WARNING: dwarf-output-path is in effect and .dwp file will "
- "possibly be written to another location that is not the same as "
- "the executable\n";
- }
- std::error_code EC;
- State.Out =
- std::make_unique<ToolOutputFile>(OutputName, EC, sys::fs::OF_None);
- const object::ObjectFile *File = BC.DwCtx->getDWARFObj().getFile();
- State.TmpBC = createDwarfOnlyBC(*File);
- State.Streamer = State.TmpBC->createStreamer(State.Out->os());
- State.MCOFI = State.Streamer->getContext().getObjectFileInfo();
- State.KnownSections = createKnownSectionsMap(*State.MCOFI);
- MCSection *const StrSection = State.MCOFI->getDwarfStrDWOSection();
-
- // Data Structures for DWP book keeping
- // Size of array corresponds to the number of sections supported by DWO format
- // in DWARF4/5.
-
- State.Strings = std::make_unique<DWPStringPool>(*State.Streamer, StrSection);
-
- // Setup DWP code once.
- DWARFContext *DWOCtx = BC.getDWOContext();
-
- if (DWOCtx) {
- State.CUIndex = &DWOCtx->getCUIndex();
- State.IsDWP = !State.CUIndex->getRows().empty();
- }
-}
-
-void DWARFRewriter::finalizeDWP(DWPState &State) {
- if (State.Version < 5) {
- // Lie about there being no info contributions so the TU index only includes
- // the type unit contribution for DWARF < 5. In DWARFv5 the TU index has a
- // contribution to the info section, so we do not want to lie about it.
- State.ContributionOffsets[0] = 0;
- }
- writeIndex(*State.Streamer.get(), State.MCOFI->getDwarfTUIndexSection(),
- State.ContributionOffsets, State.TypeIndexEntries,
- State.IndexVersion);
-
- if (State.Version < 5) {
- // Lie about the type contribution for DWARF < 5. In DWARFv5 the type
- // section does not exist, so no need to do anything about this.
- State.ContributionOffsets[getContributionIndex(DW_SECT_EXT_TYPES, 2)] = 0;
- // Unlie about the info contribution
- State.ContributionOffsets[0] = 1;
- }
- writeIndex(*State.Streamer.get(), State.MCOFI->getDwarfCUIndexSection(),
- State.ContributionOffsets, State.IndexEntries, State.IndexVersion);
-
- State.Streamer->finish();
- State.Out->keep();
-}
-
-void DWARFRewriter::updateDWP(DWARFUnit &CU,
- const OverriddenSectionsMap &OverridenSections,
- const DWARFRewriter::UnitMeta &CUMI,
- DWARFRewriter::UnitMetaVectorType &TUMetaVector,
- DWPState &State, DebugLocWriter &LocWriter,
- DebugStrOffsetsWriter &StrOffstsWriter,
- DebugStrWriter &StrWriter) {
- const uint64_t DWOId = *CU.getDWOId();
- MCSection *const StrOffsetSection = State.MCOFI->getDwarfStrOffDWOSection();
- assert(StrOffsetSection && "StrOffsetSection does not exist.");
- // Skipping CUs that we failed to load.
- std::optional<DWARFUnit *> DWOCU = BC.getDWOCU(DWOId);
- if (!DWOCU)
- return;
-
- if (State.Version == 0) {
- State.Version = CU.getVersion();
- State.IndexVersion = State.Version < 5 ? 2 : 5;
- } else if (State.Version != CU.getVersion()) {
- errs() << "BOLT-ERROR: incompatible DWARF compile unit versions\n";
- exit(1);
- }
-
- UnitIndexEntry CurEntry = {};
- CurEntry.DWOName = dwarf::toString(
- CU.getUnitDIE().find({dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}),
- "");
- const char *Name = CU.getUnitDIE().getShortName();
- if (Name)
- CurEntry.Name = Name;
- StringRef CurStrSection;
- StringRef CurStrOffsetSection;
-
- // This maps each section contained in this file to its length.
- // This information is later on used to calculate the contributions,
- // i.e. offset and length, of each compile/type unit to a section.
- std::vector<std::pair<DWARFSectionKind, uint32_t>> SectionLength;
-
- const DWARFUnitIndex::Entry *CUDWOEntry = nullptr;
- if (State.IsDWP)
- CUDWOEntry = State.CUIndex->getFromHash(DWOId);
-
- bool StrSectionWrittenOut = false;
- const object::ObjectFile *DWOFile =
- (*DWOCU)->getContext().getDWARFObj().getFile();
-
- DebugRangeListsSectionWriter *RangeListssWriter = nullptr;
- if (CU.getVersion() == 5) {
- assert(RangeListsWritersByCU.count(DWOId) != 0 &&
- "No RangeListsWriter for DWO ID.");
- RangeListssWriter = RangeListsWritersByCU[DWOId].get();
- }
- auto AddType = [&](unsigned int Index, uint32_t IndexVersion, uint64_t Offset,
- uint64_t Length, uint64_t Hash) -> void {
- UnitIndexEntry TUEntry = CurEntry;
- if (IndexVersion < 5)
- TUEntry.Contributions[0] = {};
- TUEntry.Contributions[Index].setOffset(Offset);
- TUEntry.Contributions[Index].setLength(Length);
- State.ContributionOffsets[Index] +=
- TUEntry.Contributions[Index].getLength32();
- State.TypeIndexEntries.insert(std::make_pair(Hash, TUEntry));
- };
- std::unique_ptr<DebugBufferVector> StrOffsetsOutputData;
- std::unique_ptr<DebugBufferVector> StrOutputData;
- for (const SectionRef &Section : DWOFile->sections()) {
- std::unique_ptr<DebugBufferVector> OutputData = nullptr;
- StringRef SectionName = getSectionName(Section);
- Expected<StringRef> ContentsExp = Section.getContents();
- assert(ContentsExp && "Invalid contents.");
- std::optional<StringRef> TOutData =
- updateDebugData((*DWOCU)->getContext(), SectionName, *ContentsExp,
- State.KnownSections, *State.Streamer, *this, CUDWOEntry,
- DWOId, OutputData, RangeListssWriter, LocWriter,
- StrOffstsWriter, StrWriter, OverridenSections);
- if (!TOutData)
- continue;
-
- StringRef OutData = *TOutData;
- if (SectionName == "debug_types.dwo") {
- State.Streamer->emitBytes(OutData);
- continue;
- }
-
- if (SectionName == "debug_str.dwo") {
- CurStrSection = OutData;
- StrOutputData = std::move(OutputData);
- } else {
- // Since handleDebugDataPatching returned true, we already know this is
- // a known section.
- auto SectionIter = State.KnownSections.find(SectionName);
- if (SectionIter->second.second == DWARFSectionKind::DW_SECT_STR_OFFSETS) {
- CurStrOffsetSection = OutData;
- StrOffsetsOutputData = std::move(OutputData);
- } else {
- State.Streamer->emitBytes(OutData);
- }
- unsigned int Index =
- getContributionIndex(SectionIter->second.second, State.IndexVersion);
- uint64_t Offset = State.ContributionOffsets[Index];
- uint64_t Length = OutData.size();
- if (CU.getVersion() >= 5 &&
- SectionIter->second.second == DWARFSectionKind::DW_SECT_INFO) {
- for (UnitMeta &MI : TUMetaVector)
- MI.Offset += State.DebugInfoSize;
-
- Offset = State.DebugInfoSize + CUMI.Offset;
- Length = CUMI.Length;
- State.DebugInfoSize += OutData.size();
- }
- CurEntry.Contributions[Index].setOffset(Offset);
- CurEntry.Contributions[Index].setLength(Length);
- State.ContributionOffsets[Index] +=
- CurEntry.Contributions[Index].getLength32();
- }
-
- // Strings are combined in to a new string section, and de-duplicated
- // based on hash.
- if (!StrSectionWrittenOut && !CurStrOffsetSection.empty() &&
- !CurStrSection.empty()) {
- // If debug_str.dwo section was modified storing it until dwp is written
- // out. DWPStringPool stores raw pointers to strings.
- if (StrOutputData)
- State.StrSections.push_back(std::move(StrOutputData));
- writeStringsAndOffsets(*State.Streamer.get(), *State.Strings.get(),
- StrOffsetSection, CurStrSection,
- CurStrOffsetSection, CU.getVersion());
- StrSectionWrittenOut = true;
- }
- }
- CompileUnitIdentifiers CUI{DWOId, CurEntry.Name.c_str(),
- CurEntry.DWOName.c_str()};
- auto P = State.IndexEntries.insert(std::make_pair(CUI.Signature, CurEntry));
- if (!P.second) {
- Error Err = buildDuplicateError(*P.first, CUI, "");
- errs() << "BOLT-ERROR: " << toString(std::move(Err)) << "\n";
- return;
- }
-
- // Handling TU
- const unsigned Index = getContributionIndex(
- State.IndexVersion < 5 ? DW_SECT_EXT_TYPES : DW_SECT_INFO,
- State.IndexVersion);
- for (UnitMeta &MI : TUMetaVector)
- AddType(Index, State.IndexVersion, MI.Offset, MI.Length, MI.TUHash);
-}
-
void DWARFRewriter::writeDWOFiles(
DWARFUnit &CU, const OverriddenSectionsMap &OverridenSections,
const std::string &DWOName, DebugLocWriter &LocWriter,
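In updateDebugInfo(), each split CU's DIEBuilder is now kept alive in DWODIEBuildersByCU for the whole batch, and updateDebugNamesTable() runs only after every unit in the partition has been processed, rather than interleaving name-table population with per-unit emission. A stripped-down sketch of that defer-then-flush pattern; BatchBuilder is a stand-in type, not the real DIEBuilder:

#include <memory>
#include <vector>

struct BatchBuilder {
  void build() { /* per-unit work done while iterating the batch */ }
  void flushNames() { /* deferred work, run once per builder at the end */ }
};

void processBatch(unsigned NumUnits) {
  // Owning container so the builders outlive the per-unit loop.
  std::vector<std::unique_ptr<BatchBuilder>> Builders;
  for (unsigned I = 0; I < NumUnits; ++I) {
    BatchBuilder &B = *Builders.emplace_back(std::make_unique<BatchBuilder>());
    B.build();
  }
  // Second phase: only now does the name-table-style work run, after every
  // unit in the batch has been built.
  for (std::unique_ptr<BatchBuilder> &B : Builders)
    B->flushNames();
}

Keeping the builders in owning unique_ptrs, rather than constructing one per iteration on the stack, is what makes the deferred second loop possible.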
diff --git a/bolt/test/AArch64/dummy-return.s b/bolt/test/AArch64/dummy-return.s
index a446343..91f89dc 100644
--- a/bolt/test/AArch64/dummy-return.s
+++ b/bolt/test/AArch64/dummy-return.s
@@ -1,4 +1,6 @@
-# REQUIRES: system-linux,target=aarch64{{.*}}
+# This test checks instrumentation of a static binary on AArch64.
+
+# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
diff --git a/bolt/test/X86/debug-fission-single-convert.s b/bolt/test/X86/debug-fission-single-convert.s
index 5ea6eb8..02c92902 100644
--- a/bolt/test/X86/debug-fission-single-convert.s
+++ b/bolt/test/X86/debug-fission-single-convert.s
@@ -41,19 +41,6 @@
# CHECK-ADDR-SEC: 0x00000000: Addrs: [
# CHECK-ADDR-SEC: 0x0000000000601000
-# RUN: llvm-bolt %t.exe --reorder-blocks=reverse --update-debug-sections --dwarf-output-path=%T -o %t.bolt.2.exe --write-dwp=true \
-# RUN: --always-convert-to-ranges=true
-# RUN: not llvm-dwarfdump --show-form --verbose --debug-info %t.bolt.2.exe.dwp &> %tAddrIndexTestDwp
-# RUN: cat %tAddrIndexTestDwp | FileCheck %s --check-prefix=CHECK-DWP-DEBUG
-
-# CHECK-DWP-DEBUG: DW_TAG_compile_unit [1] *
-# CHECK-DWP-DEBUG: DW_AT_producer [DW_FORM_GNU_str_index] (indexed (0000000a) string = "clang version 13.0.0")
-# CHECK-DWP-DEBUG: DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus)
-# CHECK-DWP-DEBUG: DW_AT_name [DW_FORM_GNU_str_index] (indexed (0000000b) string = "foo")
-# CHECK-DWP-DEBUG: DW_AT_GNU_dwo_name [DW_FORM_GNU_str_index] (indexed (0000000c) string = "foo")
-# CHECK-DWP-DEBUG: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x06105e732fad3796)
-
-
//clang++ -ffunction-sections -fno-exceptions -g -gsplit-dwarf=split -S debug-fission-simple.cpp -o debug-fission-simple.s
static int foo = 2;
int doStuff(int val) {
diff --git a/bolt/test/X86/debug-fission-single.s b/bolt/test/X86/debug-fission-single.s
index 4350bd9e..1aa502f 100644
--- a/bolt/test/X86/debug-fission-single.s
+++ b/bolt/test/X86/debug-fission-single.s
@@ -42,18 +42,6 @@
# CHECK-ADDR-SEC: 0x00000000: Addrs: [
# CHECK-ADDR-SEC: 0x0000000000601000
-# RUN: llvm-bolt %t.exe --reorder-blocks=reverse --update-debug-sections --dwarf-output-path=%T -o %t.bolt.2.exe --write-dwp=true
-# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt.2.exe.dwp &> %tAddrIndexTestDwp
-# RUN: cat %tAddrIndexTestDwp | FileCheck %s --check-prefix=CHECK-DWP-DEBUG
-
-# CHECK-DWP-DEBUG: DW_TAG_compile_unit [1] *
-# CHECK-DWP-DEBUG: DW_AT_producer [DW_FORM_GNU_str_index] (indexed (0000000a) string = "clang version 13.0.0")
-# CHECK-DWP-DEBUG: DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus)
-# CHECK-DWP-DEBUG: DW_AT_name [DW_FORM_GNU_str_index] (indexed (0000000b) string = "foo")
-# CHECK-DWP-DEBUG: DW_AT_GNU_dwo_name [DW_FORM_GNU_str_index] (indexed (0000000c) string = "foo")
-# CHECK-DWP-DEBUG: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x06105e732fad3796)
-
-
//clang++ -ffunction-sections -fno-exceptions -g -gsplit-dwarf=split -S debug-fission-simple.cpp -o debug-fission-simple.s
static int foo = 2;
int doStuff(int val) {
diff --git a/bolt/test/X86/dwarf4-ftypes-dwo-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwo-input-dwp-output.test
deleted file mode 100644
index d08b596..0000000
--- a/bolt/test/X86/dwarf4-ftypes-dwo-input-dwp-output.test
+++ /dev/null
@@ -1,30 +0,0 @@
-# REQUIRES: system-linux
-; RUN: rm -rf %t
-; RUN: mkdir %t
-; RUN: cd %t
-; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \
-; RUN: --filetype=obj %p/Inputs/dwarf4-ftypes-split-dwarf.s -o=main.o
-; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o -o main.exe
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.dwo | FileCheck -check-prefix=PRE-BOLT %s
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s
-
-;; Test input into bolt a .dwo file with TU Index.
-;; Make sure the output .dwp file has a type information.
-
-; PRE-BOLT: DW_TAG_type_unit
-; PRE-BOLT: DW_TAG_type_unit
-
-; PRE-BOLT-DWP-TU-INDEX: version = 2, units = 2, slots = 4
-; PRE-BOLT-DWP-TU-INDEX: Index Signature
-; PRE-BOLT-DWP-TU-INDEX: 0x675d23e4f33235f2
-; PRE-BOLT-DWP-TU-INDEX-NEXT: 0x49dc260088be7e56
-
-; BOLT: DW_TAG_type_unit
-; BOLT: DW_TAG_type_unit
-
-; BOLT-DWP-TU-INDEX: version = 2, units = 2, slots = 4
-; BOLT-DWP-TU-INDEX: Index Signature
-; BOLT-DWP-TU-INDEX: 0x675d23e4f33235f2
-; BOLT-DWP-TU-INDEX-NEXT: 0x49dc260088be7e56
diff --git a/bolt/test/X86/dwarf4-ftypes-dwo-mono-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwo-mono-input-dwp-output.test
deleted file mode 100644
index 5438214..0000000
--- a/bolt/test/X86/dwarf4-ftypes-dwo-mono-input-dwp-output.test
+++ /dev/null
@@ -1,45 +0,0 @@
-# REQUIRES: system-linux
-; RUN: rm -rf %t
-; RUN: mkdir %t
-; RUN: cd %t
-; RUN: llvm-mc --split-dwarf-file=main.dwo -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-split-gdb-index-types-main.s -o main.o
-; RUN: llvm-mc --split-dwarf-file=helper.dwo -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-split-gdb-index-types-helper.s -o helper1.o
-; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-helper2.s -o helper2.o
-; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper1.o helper2.o -o main.exe
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.dwo | FileCheck -check-prefix=PRE-BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types helper2.o | FileCheck -check-prefix=PRE-BOLT2 %s
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s
-
-;; Test input into bolt a .dwo file with TU Index.
-;; Test split-dwarf and monolithic TUs.
-;; Make sure the output .dwp file has a type information.
-
-; PRE-BOLT: 0x675d23e4f33235f2
-; PRE-BOLT: DW_TAG_type_unit
-; PRE-BOLT: 0x49dc260088be7e56
-; PRE-BOLT: DW_TAG_type_unit
-
-; PRE-BOLT2: 0x8f55ac73549bc003
-; PRE-BOLT2: DW_TAG_type_unit
-; PRE-BOLT2: 0xe7734af8fed0632e
-; PRE-BOLT2: DW_TAG_type_unit
-
-; BOLT: 0x675d23e4f33235f2
-; BOLT: DW_TAG_type_unit
-; BOLT: 0x49dc260088be7e56
-; BOLT: DW_TAG_type_unit
-; BOLT: 0x104ec427d2ebea6f
-; BOLT: DW_TAG_type_unit
-; BOLT: 0xb4580bc1535df1e4
-; BOLT: DW_TAG_type_unit
-; BOLT-NOT: 0x8f55ac73549bc003
-; BOLT-NOT: 0xe7734af8fed0632e
-
-; BOLT-DWP-TU-INDEX: version = 2, units = 4, slots = 8
-; BOLT-DWP-TU-INDEX: Index Signature
-; BOLT-DWP-TU-INDEX: 0x675d23e4f33235f2
-; BOLT-DWP-TU-INDEX-NEXT: 0xb4580bc1535df1e4
-; BOLT-DWP-TU-INDEX-NEXT: 0x49dc260088be7e56
-; BOLT-DWP-TU-INDEX-NEXT: 0x104ec427d2ebea6f
diff --git a/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test b/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test
index c9b1257..6caf587 100644
--- a/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test
+++ b/bolt/test/X86/dwarf4-split-gdb-index-types-gdb-generated.test
@@ -17,10 +17,10 @@
# POSTCHECK-NEXT: 0: Offset = 0x0, Length = 0x34
# POSTCHECK-NEXT: 1: Offset = 0x34, Length = 0x34
# POSTCHECK: Types CU list offset = 0x38, has 4 entries
-# POSTCHECK-NEXT: 0: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x675d23e4f33235f2
-# POSTCHECK-NEXT: 1: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0x49dc260088be7e56
-# POSTCHECK-NEXT: 2: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x104ec427d2ebea6f
-# POSTCHECK-NEXT: 3: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0xb4580bc1535df1e4
+# POSTCHECK-NEXT: 0: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0xb4580bc1535df1e4
+# POSTCHECK-NEXT: 1: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x675d23e4f33235f2
+# POSTCHECK-NEXT: 2: offset = 0x0000004a, type_offset = 0x0000001e, type_signature = 0x49dc260088be7e56
+# POSTCHECK-NEXT: 3: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x104ec427d2ebea6f
# POSTCHECK: Address area offset = 0x98, has 2 entries
# POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR:]],
# POSTCHECK-SAME: 0x[[#ADDR + 0x7a]]) (Size: 0x7a), CU id = 0
diff --git a/bolt/test/X86/dwarf5-df-larger-batch-size.test b/bolt/test/X86/dwarf5-df-larger-batch-size.test
new file mode 100644
index 0000000..c2c5f63
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-larger-batch-size.test
@@ -0,0 +1,28 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-input-lowpc-ranges-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-input-lowpc-ranges-other.s \
+; RUN: -split-dwarf-file=mainOther.dwo -o other.o
+; RUN: %clang %cflags main.o other.o -o main.exe
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --cu-processing-batch-size=1
+; RUN: llvm-bolt main.exe -o main-batch.exe.bolt --update-debug-sections --cu-processing-batch-size=2
+; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt
+; RUN: cat %t/foo.txt | FileCheck -check-prefix=BOLT %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo-batch.txt
+; RUN: cat %t/foo-batch.txt | FileCheck -check-prefix=BOLT-BATCH %s
+
+;; Tests that BOLT correctly handles DWO name strings with larger batch sizes.
+
+; BOLT: DW_TAG_skeleton_unit
+; BOLT: DW_AT_dwo_name [DW_FORM_strx1] (indexed (00000001) string = "main.dwo.dwo")
+
+; BOLT: DW_TAG_skeleton_unit
+; BOLT: DW_AT_dwo_name [DW_FORM_strx1] (indexed (00000001) string = "mainOther.dwo.dwo")
+
+; BOLT-BATCH: DW_TAG_skeleton_unit
+; BOLT-BATCH: DW_AT_dwo_name [DW_FORM_strx1] (indexed (00000001) string = "main.dwo.dwo")
+
+; BOLT-BATCH: DW_TAG_skeleton_unit
+; BOLT-BATCH: DW_AT_dwo_name [DW_FORM_strx1] (indexed (00000001) string = "mainOther.dwo.dwo")
diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
index 6c603ba..c8cfd82 100644
--- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
+++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test
@@ -72,59 +72,6 @@
; BOLT-NEXT: "helper.cpp"
; BOLT-NEXT: "helper.dwo"
-
-;; Tests that BOLT correctly handles updating DW_AT_dwo_name when it outputs a DWP file.
-;; Currently skipping one of Type units because it is not being de-dupped.
-;; In the tu-index this TU is not present.
-; RUN: rm main.exe.bolt
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt.dwp > logDWP.txt
-; RUN: llvm-dwarfdump --debug-str-offsets main.exe.bolt.dwp >> logDWP.txt
-; RUN: cat logDWP.txt | FileCheck -check-prefix=BOLT-DWP %s
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir (".")
-; BOLT-DWP: DW_AT_dwo_name ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir (".")
-; BOLT-DWP: DW_AT_dwo_name ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_compile_unit
-; BOLT-DWP: DW_AT_dwo_name ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP-NOT: DW_AT_dwo_name
-; BOLT-DWP: Contribution size = 68, Format = DWARF32, Version = 5
-; BOLT-DWP-NEXT: "main"
-; BOLT-DWP-NEXT: "int"
-; BOLT-DWP-NEXT: "argc"
-; BOLT-DWP-NEXT: "argv"
-; BOLT-DWP-NEXT: "char"
-; BOLT-DWP-NEXT: "f2"
-; BOLT-DWP-NEXT: "."
-; BOLT-DWP-NEXT: "main.dwo.dwo"
-; BOLT-DWP-NEXT: "c1"
-; BOLT-DWP-NEXT: "Foo2"
-; BOLT-DWP-NEXT: "f3"
-; BOLT-DWP-NEXT: "c2"
-; BOLT-DWP-NEXT: "c3"
-; BOLT-DWP-NEXT: "Foo2a"
-; BOLT-DWP-NEXT: "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
-; BOLT-DWP-NEXT: "main.cpp"
-; BOLT-DWP-NEXT: Contribution size = 64, Format = DWARF32, Version = 5
-; BOLT-DWP-NEXT: "fooint"
-; BOLT-DWP-NEXT: "int"
-; BOLT-DWP-NEXT: "_Z3foov"
-; BOLT-DWP-NEXT: "foo"
-; BOLT-DWP-NEXT: "fint"
-; BOLT-DWP-NEXT: "c1"
-; BOLT-DWP-NEXT: "c2"
-; BOLT-DWP-NEXT: "Foo2Int"
-; BOLT-DWP-NEXT: "f"
-; BOLT-DWP-NEXT: "char"
-; BOLT-DWP-NEXT: "c3"
-; BOLT-DWP-NEXT: "Foo2a"
-; BOLT-DWP-NEXT: "clang version 18.0.0"
-; BOLT-DWP-NEXT: "helper.cpp"
-; BOLT-DWP-NEXT: "helper.dwo
-
;; Tests that BOLT correctly handles updating DW_AT_comp_dir/DW_AT_dwo_name when an output directory is specified.
; RUN: mkdir DWOOut
diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test
index 086f8f8..12a7f64 100644
--- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test
+++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test
@@ -73,31 +73,6 @@
; BOLT-NEXT: "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)"
; BOLT-NEXT: "helper.cpp"
-
-;; Tests that BOLT correctly handles updating DW_AT_dwo_name when it outputs a DWP file.
-;; Currently skipping one of Type units because it is not being de-dupped.
-;; In the tu-index this TU is not present.
-; RUN: rm main.exe.bolt
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt.dwp > logDWP.txt
-; RUN: llvm-dwarfdump --debug-str-offsets main.exe.bolt.dwp >> logDWP.txt
-; RUN: cat logDWP.txt | FileCheck -check-prefix=BOLT-DWP %s
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir (".")
-; BOLT-DWP: DW_AT_dwo_name ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir (".")
-; BOLT-DWP: DW_AT_dwo_name ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_compile_unit
-; BOLT-DWP: DW_AT_dwo_name ("main.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_AT_comp_dir (".")
-; BOLT-DWP: DW_AT_dwo_name ("helper.dwo.dwo")
-; BOLT-DWP: DW_TAG_type_unit
-; BOLT-DWP: DW_TAG_compile_unit
-; BOLT-DWP: DW_AT_name ("helper.cpp")
-; BOLT-DWP: DW_AT_dwo_name ("helper.dwo.dwo")
-
;; Tests that BOLT correctly handles updating DW_AT_comp_dir/DW_AT_dwo_name when an output directory is specified.
; RUN: mkdir DWOOut
diff --git a/bolt/test/X86/dwarf5-ftypes-dwo-mono-input-dwp-output.test b/bolt/test/X86/dwarf5-ftypes-dwo-mono-input-dwp-output.test
deleted file mode 100644
index b6e9f60..0000000
--- a/bolt/test/X86/dwarf5-ftypes-dwo-mono-input-dwp-output.test
+++ /dev/null
@@ -1,55 +0,0 @@
-# REQUIRES: system-linux
-; RUN: rm -rf %t
-; RUN: mkdir %t
-; RUN: cd %t
-; RUN: llvm-mc --split-dwarf-file=main.dwo -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-split-gdb-index-types-main.s -o main.o
-; RUN: llvm-mc --split-dwarf-file=helper.dwo -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-split-gdb-index-types-helper.s -o helper1.o
-; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-helper2.s -o helper2.o
-; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper1.o helper2.o -o main.exe
-; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo | FileCheck -check-prefix=PRE-BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-info helper2.o | FileCheck -check-prefix=PRE-BOLT2 %s
-; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp
-; RUN: llvm-dwarfdump --show-form --verbose --debug-info -r 0 main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s
-; RUN: llvm-dwarfdump --show-form --verbose --debug-cu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-CU-INDEX %s
-
-;; Test input into bolt a .dwo file with TU Index.
-;; Test split-dwarf and monolithic TUs.
-;; Make sure the output .dwp file has a type and cu information.
-
-; PRE-BOLT: Type Unit
-; PRE-BOLT-SAME: 0x675d23e4f33235f2
-; PRE-BOLT: Type Unit
-; PRE-BOLT-SAME: 0x49dc260088be7e56
-
-; PRE-BOLT2: 0x8f55ac73549bc003
-; PRE-BOLT2: DW_TAG_type_unit
-; PRE-BOLT2: 0xe7734af8fed0632e
-; PRE-BOLT2: DW_TAG_type_unit
-
-; BOLT: 0x00000000: Type Unit: length = 0x00000047
-; BOLT-SAME: 0x675d23e4f33235f2
-; BOLT: 0x0000004b: Type Unit: length = 0x0000003e
-; BOLT-SAME: 0x49dc260088be7e56
-; BOLT: 0x0000008d: Compile Unit: length = 0x00000077
-; BOLT-SAME: 0x4257354d8bb35644
-; BOLT: 0x00000108: Type Unit: length = 0x00000047
-; BOLT-SAME: 0x104ec427d2ebea6f
-; BOLT: 0x00000153: Type Unit: length = 0x0000003e
-; BOLT-SAME: 0xb4580bc1535df1e4
-; BOLT: 0x00000195: Compile Unit: length = 0x00000054
-; BOLT-SAME: 0x7738bfb5f3edfb73
-; BOLT-NOT: 0x8f55ac73549bc003
-; BOLT-NOT: 0xe7734af8fed0632e
-
-; BOLT-DWP-TU-INDEX: version = 5, units = 4, slots = 8
-; BOLT-DWP-TU-INDEX: Index Signature
-; BOLT-DWP-TU-INDEX: 3 0x675d23e4f33235f2 [0x0000000000000000, 0x000000000000004b) [0x00000000, 0x00000083) [0x00000000, 0x00000056) [0x00000000, 0x00000044)
-; BOLT-DWP-TU-INDEX: 5 0xb4580bc1535df1e4 [0x0000000000000153, 0x0000000000000195) [0x00000083, 0x000000f9) [0x00000056, 0x000000ae) [0x00000044, 0x00000084)
-; BOLT-DWP-TU-INDEX: 7 0x49dc260088be7e56 [0x000000000000004b, 0x000000000000008d) [0x00000000, 0x00000083) [0x00000000, 0x00000056) [0x00000000, 0x00000044)
-; BOLT-DWP-TU-INDEX: 8 0x104ec427d2ebea6f [0x0000000000000108, 0x0000000000000153) [0x00000083, 0x000000f9) [0x00000056, 0x000000ae) [0x00000044, 0x00000084)
-
-; BOLT-DWP-CU-INDEX: version = 5, units = 2, slots = 4
-; BOLT-DWP-CU-INDEX: Index Signature
-; BOLT-DWP-CU-INDEX: 1 0x4257354d8bb35644 [0x000000000000008d, 0x0000000000000108) [0x00000000, 0x00000083) [0x00000000, 0x00000056) [0x00000000, 0x00000044)
-; BOLT-DWP-CU-INDEX: 4 0x7738bfb5f3edfb73 [0x0000000000000195, 0x00000000000001ed) [0x00000083, 0x000000f9) [0x00000056, 0x000000ae) [0x00000044, 0x00000084)
diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py
index 1ce2019..bd69bdd 100755
--- a/clang-tools-extra/clang-tidy/add_new_check.py
+++ b/clang-tools-extra/clang-tidy/add_new_check.py
@@ -13,11 +13,13 @@ from __future__ import unicode_literals
import argparse
import io
+import itertools
import os
import re
import sys
import textwrap
+
# Adapts the module's CMakelist file. Returns 'True' if it could add a new
# entry and 'False' if the entry already existed.
def adapt_cmake(module_path, check_name_camel):
@@ -55,13 +57,28 @@ def adapt_cmake(module_path, check_name_camel):
# Adds a header for the new check.
def write_header(
- module_path, module, namespace, check_name, check_name_camel, description
+ module_path,
+ module,
+ namespace,
+ check_name,
+ check_name_camel,
+ description,
+ lang_restrict,
):
wrapped_desc = "\n".join(
textwrap.wrap(
description, width=80, initial_indent="/// ", subsequent_indent="/// "
)
)
+ if lang_restrict:
+ override_supported = """
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+ return %s;
+ }""" % (
+ lang_restrict % {"lang": "LangOpts"}
+ )
+ else:
+ override_supported = ""
filename = os.path.join(module_path, check_name_camel) + ".h"
print("Creating %s..." % filename)
with io.open(filename, "w", encoding="utf8", newline="\n") as f:
@@ -102,7 +119,7 @@ public:
%(check_name_camel)s(StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context) {}
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
- void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+ void check(const ast_matchers::MatchFinder::MatchResult &Result) override;%(override_supported)s
};
} // namespace clang::tidy::%(namespace)s
@@ -116,6 +133,7 @@ public:
"module": module,
"namespace": namespace,
"description": wrapped_desc,
+ "override_supported": override_supported,
}
)
@@ -306,7 +324,9 @@ def add_release_notes(module_path, module, check_name, description):
# Adds a test for the check.
-def write_test(module_path, module, check_name, test_extension):
+def write_test(module_path, module, check_name, test_extension, test_standard):
+ if test_standard:
+ test_standard = f"-std={test_standard}-or-later "
check_name_dashes = module + "-" + check_name
filename = os.path.normpath(
os.path.join(
@@ -323,7 +343,7 @@ def write_test(module_path, module, check_name, test_extension):
print("Creating %s..." % filename)
with io.open(filename, "w", encoding="utf8", newline="\n") as f:
f.write(
- """// RUN: %%check_clang_tidy %%s %(check_name_dashes)s %%t
+ """// RUN: %%check_clang_tidy %(standard)s%%s %(check_name_dashes)s %%t
// FIXME: Add something that triggers the check here.
void f();
@@ -338,7 +358,7 @@ void f();
// FIXME: Add something that doesn't trigger the check here.
void awesome_f2();
"""
- % {"check_name_dashes": check_name_dashes}
+ % {"check_name_dashes": check_name_dashes, "standard": test_standard}
)
@@ -511,7 +531,10 @@ def update_checks_list(clang_tidy_path):
if (match or (check_name.startswith("clang-analyzer-"))) and check_name:
module = doc_file[0]
check_file = doc_file[1].replace(".rst", "")
- if not match or match.group(1) == "https://clang.llvm.org/docs/analyzer/checkers":
+ if (
+ not match
+ or match.group(1) == "https://clang.llvm.org/docs/analyzer/checkers"
+ ):
title = "Clang Static Analyzer " + check_file
# Preserve the anchor in checkers.html from group 2.
target = "" if not match else match.group(1) + ".html" + match.group(2)
@@ -529,7 +552,7 @@ def update_checks_list(clang_tidy_path):
if target:
# The checker is just a redirect.
return (
- " :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(ref_begin)s`%(title)s <%(target)s>`%(ref_end)s,%(autofix)s\n"
+ " :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(ref_begin)s`%(title)s <%(target)s>`%(ref_end)s,%(autofix)s\n"
% {
"check_name": check_name,
"module": module,
@@ -537,13 +560,14 @@ def update_checks_list(clang_tidy_path):
"target": target,
"title": title,
"autofix": autofix,
- "ref_begin" : ref_begin,
- "ref_end" : ref_end
- })
+ "ref_begin": ref_begin,
+ "ref_end": ref_end,
+ }
+ )
else:
# The checker is just an alias without redirect.
return (
- " :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(title)s,%(autofix)s\n"
+ " :doc:`%(check_name)s <%(module)s/%(check_file)s>`, %(title)s,%(autofix)s\n"
% {
"check_name": check_name,
"module": module,
@@ -551,7 +575,8 @@ def update_checks_list(clang_tidy_path):
"target": target,
"title": title,
"autofix": autofix,
- })
+ }
+ )
return ""
checks = map(format_link, doc_files)
@@ -613,6 +638,22 @@ def main():
"objc": "m",
"objc++": "mm",
}
+ cpp_language_to_requirements = {
+ "c++98": "CPlusPlus",
+ "c++11": "CPlusPlus11",
+ "c++14": "CPlusPlus14",
+ "c++17": "CPlusPlus17",
+ "c++20": "CPlusPlus20",
+ "c++23": "CPlusPlus23",
+ "c++26": "CPlusPlus26",
+ }
+ c_language_to_requirements = {
+ "c99": None,
+ "c11": "C11",
+ "c17": "C17",
+ "c23": "C23",
+ "c27": "C2Y",
+ }
parser = argparse.ArgumentParser()
parser.add_argument(
"--update-docs",
@@ -623,7 +664,7 @@ def main():
"--language",
help="language to use for new check (defaults to c++)",
choices=language_to_extension.keys(),
- default="c++",
+ default=None,
metavar="LANG",
)
parser.add_argument(
@@ -634,6 +675,16 @@ def main():
type=str,
)
parser.add_argument(
+ "--standard",
+ help="Specify a specific version of the language",
+ choices=list(
+ itertools.chain(
+ cpp_language_to_requirements.keys(), c_language_to_requirements.keys()
+ )
+ ),
+ default=None,
+ )
+ parser.add_argument(
"module",
nargs="?",
help="module directory under which to place the new tidy check (e.g., misc)",
@@ -677,14 +728,49 @@ def main():
if not description.endswith("."):
description += "."
+ language = args.language
+
+ if args.standard:
+ if args.standard in cpp_language_to_requirements:
+ if language and language != "c++":
+ raise ValueError("C++ standard chosen when language is not C++")
+ language = "c++"
+ elif args.standard in c_language_to_requirements:
+ if language and language != "c":
+ raise ValueError("C standard chosen when language is not C")
+ language = "c"
+
+ if not language:
+ language = "c++"
+
+ language_restrict = None
+
+ if language == "c":
+ language_restrict = "!%(lang)s.CPlusPlus"
+ extra = c_language_to_requirements.get(args.standard, None)
+ if extra:
+ language_restrict += f" && %(lang)s.{extra}"
+ elif language == "c++":
+ language_restrict = (
+ f"%(lang)s.{cpp_language_to_requirements.get(args.standard, 'CPlusPlus')}"
+ )
+ elif language in ["objc", "objc++"]:
+ language_restrict = "%(lang)s.ObjC"
+
write_header(
- module_path, module, namespace, check_name, check_name_camel, description
+ module_path,
+ module,
+ namespace,
+ check_name,
+ check_name_camel,
+ description,
+ language_restrict,
)
write_implementation(module_path, module, namespace, check_name_camel)
adapt_module(module_path, module, check_name, check_name_camel)
add_release_notes(module_path, module, check_name, description)
- test_extension = language_to_extension.get(args.language)
- write_test(module_path, module, check_name, test_extension)
+ test_extension = language_to_extension.get(language)
+ write_test(module_path, module, check_name, test_extension, args.standard)
write_docs(module_path, module, check_name)
update_checks_list(clang_tidy_path)
print("Done. Now it's your turn!")
diff --git a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
index e92179a..5e39c05 100755
--- a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
+++ b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py
@@ -205,9 +205,11 @@ class CheckRunner:
self.temp_file_name,
]
+ [
- "-fix"
- if self.export_fixes is None
- else "--export-fixes=" + self.export_fixes
+ (
+ "-fix"
+ if self.export_fixes is None
+ else "--export-fixes=" + self.export_fixes
+ )
]
+ [
"--checks=-*," + self.check_name,
@@ -299,19 +301,37 @@ class CheckRunner:
self.check_notes(clang_tidy_output)
+CPP_STANDARDS = [
+ "c++98",
+ "c++11",
+ ("c++14", "c++1y"),
+ ("c++17", "c++1z"),
+ ("c++20", "c++2a"),
+ ("c++23", "c++2b"),
+ ("c++26", "c++2c"),
+]
+C_STANDARDS = ["c99", ("c11", "c1x"), "c17", ("c23", "c2x"), "c2y"]
+
+
def expand_std(std):
- if std == "c++98-or-later":
- return ["c++98", "c++11", "c++14", "c++17", "c++20", "c++23", "c++2c"]
- if std == "c++11-or-later":
- return ["c++11", "c++14", "c++17", "c++20", "c++23", "c++2c"]
- if std == "c++14-or-later":
- return ["c++14", "c++17", "c++20", "c++23", "c++2c"]
- if std == "c++17-or-later":
- return ["c++17", "c++20", "c++23", "c++2c"]
- if std == "c++20-or-later":
- return ["c++20", "c++23", "c++2c"]
- if std == "c++23-or-later":
- return ["c++23", "c++2c"]
+ split_std, or_later, _ = std.partition("-or-later")
+
+ if not or_later:
+ return [split_std]
+
+ for standard_list in (CPP_STANDARDS, C_STANDARDS):
+ item = next(
+ (
+ i
+ for i, v in enumerate(standard_list)
+ if (split_std in v if isinstance(v, (list, tuple)) else split_std == v)
+ ),
+ None,
+ )
+ if item is not None:
+ return [split_std] + [
+ x if isinstance(x, str) else x[0] for x in standard_list[item + 1 :]
+ ]
return [std]
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index be024da..2038ef6 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -43,7 +43,7 @@ The major indexing objects are:
Most object information is exposed using properties, when the underlying API
call is efficient.
"""
-from __future__ import absolute_import, division, print_function
+from __future__ import annotations
# TODO
# ====
@@ -64,48 +64,80 @@ from __future__ import absolute_import, division, print_function
from ctypes import *
-import collections.abc
import os
+import sys
from enum import Enum
+from typing import (
+ Any,
+ Callable,
+ Generic,
+ Optional,
+ Type as TType,
+ TypeVar,
+ TYPE_CHECKING,
+ Union as TUnion,
+)
+
+if TYPE_CHECKING:
+ from ctypes import _Pointer
+ from typing_extensions import Protocol, TypeAlias
+
+ StrPath: TypeAlias = TUnion[str, os.PathLike[str]]
+ LibFunc: TypeAlias = TUnion[
+ "tuple[str, Optional[list[Any]]]",
+ "tuple[str, Optional[list[Any]], Any]",
+ "tuple[str, Optional[list[Any]], Any, Callable[..., Any]]",
+ ]
+
+ TSeq = TypeVar("TSeq", covariant=True)
+
+ class NoSliceSequence(Protocol[TSeq]):
+ def __len__(self) -> int:
+ ...
+
+ def __getitem__(self, key: int) -> TSeq:
+ ...
+
# Python 3 strings are unicode, translate them to/from utf8 for C-interop.
class c_interop_string(c_char_p):
- def __init__(self, p=None):
+ def __init__(self, p: str | bytes | None = None):
if p is None:
p = ""
if isinstance(p, str):
p = p.encode("utf8")
super(c_char_p, self).__init__(p)
- def __str__(self):
- return self.value
+ def __str__(self) -> str:
+ return self.value or ""
@property
- def value(self):
- if super(c_char_p, self).value is None:
+ def value(self) -> str | None: # type: ignore [override]
+ val = super(c_char_p, self).value
+ if val is None:
return None
- return super(c_char_p, self).value.decode("utf8")
+ return val.decode("utf8")
@classmethod
- def from_param(cls, param):
+ def from_param(cls, param: str | bytes | None) -> c_interop_string:
if isinstance(param, str):
return cls(param)
if isinstance(param, bytes):
return cls(param)
if param is None:
# Support passing null to C functions expecting char arrays
- return None
+ return cls(param)
raise TypeError(
"Cannot convert '{}' to '{}'".format(type(param).__name__, cls.__name__)
)
@staticmethod
- def to_python_string(x, *args):
+ def to_python_string(x: c_interop_string, *args: Any) -> str | None:
return x.value
-def b(x):
+def b(x: str | bytes) -> bytes:
if isinstance(x, bytes):
return x
return x.encode("utf8")
@@ -115,9 +147,7 @@ def b(x):
# object. This is a problem, because it means that from_parameter will see an
# integer and pass the wrong value on platforms where int != void*. Work around
# this by marshalling object arguments as void**.
-c_object_p = POINTER(c_void_p)
-
-callbacks = {}
+c_object_p: TType[_Pointer[Any]] = POINTER(c_void_p)
### Exception Classes ###
@@ -169,8 +199,11 @@ class TranslationUnitSaveError(Exception):
### Structures and Utility Classes ###
+TInstance = TypeVar("TInstance")
+TResult = TypeVar("TResult")
+
-class CachedProperty:
+class CachedProperty(Generic[TInstance, TResult]):
"""Decorator that lazy-loads the value of a property.
The first time the property is accessed, the original property function is
@@ -178,16 +211,20 @@ class CachedProperty:
property, replacing the original method.
"""
- def __init__(self, wrapped):
+ def __init__(self, wrapped: Callable[[TInstance], TResult]):
self.wrapped = wrapped
try:
self.__doc__ = wrapped.__doc__
except:
pass
- def __get__(self, instance, instance_type=None):
+ def __get__(self, instance: TInstance, instance_type: Any = None) -> TResult:
if instance is None:
- return self
+ property_name = self.wrapped.__name__
+ class_name = instance_type.__name__
+ raise TypeError(
+ f"'{property_name}' is not a static attribute of '{class_name}'"
+ )
value = self.wrapped(instance)
setattr(instance, self.wrapped.__name__, value)
@@ -200,13 +237,16 @@ class _CXString(Structure):
_fields_ = [("spelling", c_char_p), ("free", c_int)]
- def __del__(self):
+ def __del__(self) -> None:
conf.lib.clang_disposeString(self)
@staticmethod
- def from_result(res, fn=None, args=None):
+ def from_result(res: _CXString, fn: Any = None, args: Any = None) -> str:
assert isinstance(res, _CXString)
- return conf.lib.clang_getCString(res)
+ pystr: str | None = conf.lib.clang_getCString(res)
+ if pystr is None:
+ return ""
+ return pystr
class SourceLocation(Structure):
@@ -236,7 +276,7 @@ class SourceLocation(Structure):
Retrieve the source location associated with a given file/line/column in
a particular translation unit.
"""
- return conf.lib.clang_getLocation(tu, file, line, column)
+ return conf.lib.clang_getLocation(tu, file, line, column) # type: ignore [no-any-return]
@staticmethod
def from_offset(tu, file, offset):
@@ -246,7 +286,7 @@ class SourceLocation(Structure):
file -- File instance to obtain offset from
offset -- Integer character offset within file
"""
- return conf.lib.clang_getLocationForOffset(tu, file, offset)
+ return conf.lib.clang_getLocationForOffset(tu, file, offset) # type: ignore [no-any-return]
@property
def file(self):
@@ -271,10 +311,10 @@ class SourceLocation(Structure):
@property
def is_in_system_header(self):
"""Returns true if the given source location is in a system header."""
- return conf.lib.clang_Location_isInSystemHeader(self)
+ return conf.lib.clang_Location_isInSystemHeader(self) # type: ignore [no-any-return]
def __eq__(self, other):
- return conf.lib.clang_equalLocations(self, other)
+ return conf.lib.clang_equalLocations(self, other) # type: ignore [no-any-return]
def __ne__(self, other):
return not self.__eq__(other)
@@ -307,7 +347,7 @@ class SourceRange(Structure):
# object.
@staticmethod
def from_locations(start, end):
- return conf.lib.clang_getRange(start, end)
+ return conf.lib.clang_getRange(start, end) # type: ignore [no-any-return]
@property
def start(self):
@@ -315,7 +355,7 @@ class SourceRange(Structure):
Return a SourceLocation representing the first character within a
source range.
"""
- return conf.lib.clang_getRangeStart(self)
+ return conf.lib.clang_getRangeStart(self) # type: ignore [no-any-return]
@property
def end(self):
@@ -323,10 +363,10 @@ class SourceRange(Structure):
Return a SourceLocation representing the last character within a
source range.
"""
- return conf.lib.clang_getRangeEnd(self)
+ return conf.lib.clang_getRangeEnd(self) # type: ignore [no-any-return]
def __eq__(self, other):
- return conf.lib.clang_equalRanges(self, other)
+ return conf.lib.clang_equalRanges(self, other) # type: ignore [no-any-return]
def __ne__(self, other):
return not self.__eq__(other)
@@ -389,42 +429,42 @@ class Diagnostic:
@property
def severity(self):
- return conf.lib.clang_getDiagnosticSeverity(self)
+ return conf.lib.clang_getDiagnosticSeverity(self) # type: ignore [no-any-return]
@property
def location(self):
- return conf.lib.clang_getDiagnosticLocation(self)
+ return conf.lib.clang_getDiagnosticLocation(self) # type: ignore [no-any-return]
@property
def spelling(self):
- return conf.lib.clang_getDiagnosticSpelling(self)
+ return conf.lib.clang_getDiagnosticSpelling(self) # type: ignore [no-any-return]
@property
- def ranges(self):
+ def ranges(self) -> NoSliceSequence[SourceRange]:
class RangeIterator:
- def __init__(self, diag):
+ def __init__(self, diag: Diagnostic):
self.diag = diag
- def __len__(self):
+ def __len__(self) -> int:
return int(conf.lib.clang_getDiagnosticNumRanges(self.diag))
- def __getitem__(self, key):
+ def __getitem__(self, key: int) -> SourceRange:
if key >= len(self):
raise IndexError
- return conf.lib.clang_getDiagnosticRange(self.diag, key)
+ return conf.lib.clang_getDiagnosticRange(self.diag, key) # type: ignore [no-any-return]
return RangeIterator(self)
@property
- def fixits(self):
+ def fixits(self) -> NoSliceSequence[FixIt]:
class FixItIterator:
- def __init__(self, diag):
+ def __init__(self, diag: Diagnostic):
self.diag = diag
- def __len__(self):
+ def __len__(self) -> int:
return int(conf.lib.clang_getDiagnosticNumFixIts(self.diag))
- def __getitem__(self, key):
+ def __getitem__(self, key: int) -> FixIt:
range = SourceRange()
value = conf.lib.clang_getDiagnosticFixIt(self.diag, key, byref(range))
if len(value) == 0:
@@ -435,15 +475,15 @@ class Diagnostic:
return FixItIterator(self)
@property
- def children(self):
+ def children(self) -> NoSliceSequence[Diagnostic]:
class ChildDiagnosticsIterator:
- def __init__(self, diag):
+ def __init__(self, diag: Diagnostic):
self.diag_set = conf.lib.clang_getChildDiagnostics(diag)
- def __len__(self):
+ def __len__(self) -> int:
return int(conf.lib.clang_getNumDiagnosticsInSet(self.diag_set))
- def __getitem__(self, key):
+ def __getitem__(self, key: int) -> Diagnostic:
diag = conf.lib.clang_getDiagnosticInSet(self.diag_set, key)
if not diag:
raise IndexError
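The `NoSliceSequence[...]` annotations on `ranges`, `fixits` and `children` refer to a helper defined earlier in the module, outside the hunks shown here. As an assumption about its shape, it is presumably a small typing Protocol covering exactly what these iterator classes provide: a length plus index-only access.

    from typing import Protocol, TypeVar

    T = TypeVar("T", covariant=True)

    # Assumed sketch of NoSliceSequence: len() and integer indexing, no slices.
    class NoSliceSequence(Protocol[T]):
        def __len__(self) -> int: ...
        def __getitem__(self, key: int) -> T: ...

Iteration still works through the classic `__getitem__` protocol, so `for r in diag.ranges:` behaves as it did before the annotations were added.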
@@ -454,17 +494,17 @@ class Diagnostic:
@property
def category_number(self):
"""The category number for this diagnostic or 0 if unavailable."""
- return conf.lib.clang_getDiagnosticCategory(self)
+ return conf.lib.clang_getDiagnosticCategory(self) # type: ignore [no-any-return]
@property
def category_name(self):
"""The string name of the category for this diagnostic."""
- return conf.lib.clang_getDiagnosticCategoryText(self)
+ return conf.lib.clang_getDiagnosticCategoryText(self) # type: ignore [no-any-return]
@property
def option(self):
"""The command-line option that enables this diagnostic."""
- return conf.lib.clang_getDiagnosticOption(self, None)
+ return conf.lib.clang_getDiagnosticOption(self, None) # type: ignore [no-any-return]
@property
def disable_option(self):
@@ -484,7 +524,7 @@ class Diagnostic:
options = conf.lib.clang_defaultDiagnosticDisplayOptions()
if options & ~Diagnostic._FormatOptionsMask:
raise ValueError("Invalid format options")
- return conf.lib.clang_formatDiagnostic(self, options)
+ return conf.lib.clang_formatDiagnostic(self, options) # type: ignore [no-any-return]
def __repr__(self):
return "<Diagnostic severity %r, location %r, spelling %r>" % (
@@ -619,39 +659,39 @@ class CursorKind(BaseEnumeration):
def is_declaration(self):
"""Test if this is a declaration kind."""
- return conf.lib.clang_isDeclaration(self)
+ return conf.lib.clang_isDeclaration(self) # type: ignore [no-any-return]
def is_reference(self):
"""Test if this is a reference kind."""
- return conf.lib.clang_isReference(self)
+ return conf.lib.clang_isReference(self) # type: ignore [no-any-return]
def is_expression(self):
"""Test if this is an expression kind."""
- return conf.lib.clang_isExpression(self)
+ return conf.lib.clang_isExpression(self) # type: ignore [no-any-return]
def is_statement(self):
"""Test if this is a statement kind."""
- return conf.lib.clang_isStatement(self)
+ return conf.lib.clang_isStatement(self) # type: ignore [no-any-return]
def is_attribute(self):
"""Test if this is an attribute kind."""
- return conf.lib.clang_isAttribute(self)
+ return conf.lib.clang_isAttribute(self) # type: ignore [no-any-return]
def is_invalid(self):
"""Test if this is an invalid kind."""
- return conf.lib.clang_isInvalid(self)
+ return conf.lib.clang_isInvalid(self) # type: ignore [no-any-return]
def is_translation_unit(self):
"""Test if this is a translation unit kind."""
- return conf.lib.clang_isTranslationUnit(self)
+ return conf.lib.clang_isTranslationUnit(self) # type: ignore [no-any-return]
def is_preprocessing(self):
"""Test if this is a preprocessing kind."""
- return conf.lib.clang_isPreprocessing(self)
+ return conf.lib.clang_isPreprocessing(self) # type: ignore [no-any-return]
def is_unexposed(self):
"""Test if this is an unexposed kind."""
- return conf.lib.clang_isUnexposed(self)
+ return conf.lib.clang_isUnexposed(self) # type: ignore [no-any-return]
###
@@ -1524,7 +1564,7 @@ class Cursor(Structure):
return cursor
def __eq__(self, other):
- return conf.lib.clang_equalCursors(self, other)
+ return conf.lib.clang_equalCursors(self, other) # type: ignore [no-any-return]
def __ne__(self, other):
return not self.__eq__(other)
@@ -1534,41 +1574,41 @@ class Cursor(Structure):
Returns true if the declaration pointed at by the cursor is also a
definition of that entity.
"""
- return conf.lib.clang_isCursorDefinition(self)
+ return conf.lib.clang_isCursorDefinition(self) # type: ignore [no-any-return]
def is_const_method(self):
"""Returns True if the cursor refers to a C++ member function or member
function template that is declared 'const'.
"""
- return conf.lib.clang_CXXMethod_isConst(self)
+ return conf.lib.clang_CXXMethod_isConst(self) # type: ignore [no-any-return]
def is_converting_constructor(self):
"""Returns True if the cursor refers to a C++ converting constructor."""
- return conf.lib.clang_CXXConstructor_isConvertingConstructor(self)
+ return conf.lib.clang_CXXConstructor_isConvertingConstructor(self) # type: ignore [no-any-return]
def is_copy_constructor(self):
"""Returns True if the cursor refers to a C++ copy constructor."""
- return conf.lib.clang_CXXConstructor_isCopyConstructor(self)
+ return conf.lib.clang_CXXConstructor_isCopyConstructor(self) # type: ignore [no-any-return]
def is_default_constructor(self):
"""Returns True if the cursor refers to a C++ default constructor."""
- return conf.lib.clang_CXXConstructor_isDefaultConstructor(self)
+ return conf.lib.clang_CXXConstructor_isDefaultConstructor(self) # type: ignore [no-any-return]
def is_move_constructor(self):
"""Returns True if the cursor refers to a C++ move constructor."""
- return conf.lib.clang_CXXConstructor_isMoveConstructor(self)
+ return conf.lib.clang_CXXConstructor_isMoveConstructor(self) # type: ignore [no-any-return]
def is_default_method(self):
"""Returns True if the cursor refers to a C++ member function or member
function template that is declared '= default'.
"""
- return conf.lib.clang_CXXMethod_isDefaulted(self)
+ return conf.lib.clang_CXXMethod_isDefaulted(self) # type: ignore [no-any-return]
def is_deleted_method(self):
"""Returns True if the cursor refers to a C++ member function or member
function template that is declared '= delete'.
"""
- return conf.lib.clang_CXXMethod_isDeleted(self)
+ return conf.lib.clang_CXXMethod_isDeleted(self) # type: ignore [no-any-return]
def is_copy_assignment_operator_method(self):
"""Returnrs True if the cursor refers to a copy-assignment operator.
@@ -1593,7 +1633,7 @@ class Cursor(Structure):
Is not.
"""
- return conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self)
+ return conf.lib.clang_CXXMethod_isCopyAssignmentOperator(self) # type: ignore [no-any-return]
def is_move_assignment_operator_method(self):
"""Returnrs True if the cursor refers to a move-assignment operator.
@@ -1618,7 +1658,7 @@ class Cursor(Structure):
Is not.
"""
- return conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self)
+ return conf.lib.clang_CXXMethod_isMoveAssignmentOperator(self) # type: ignore [no-any-return]
def is_explicit_method(self):
"""Determines if a C++ constructor or conversion function is
@@ -1663,41 +1703,41 @@ class Cursor(Structure):
This method will return 0 for the constructor and 1 for
the conversion function.
"""
- return conf.lib.clang_CXXMethod_isExplicit(self)
+ return conf.lib.clang_CXXMethod_isExplicit(self) # type: ignore [no-any-return]
def is_mutable_field(self):
"""Returns True if the cursor refers to a C++ field that is declared
'mutable'.
"""
- return conf.lib.clang_CXXField_isMutable(self)
+ return conf.lib.clang_CXXField_isMutable(self) # type: ignore [no-any-return]
def is_pure_virtual_method(self):
"""Returns True if the cursor refers to a C++ member function or member
function template that is declared pure virtual.
"""
- return conf.lib.clang_CXXMethod_isPureVirtual(self)
+ return conf.lib.clang_CXXMethod_isPureVirtual(self) # type: ignore [no-any-return]
def is_static_method(self):
"""Returns True if the cursor refers to a C++ member function or member
function template that is declared 'static'.
"""
- return conf.lib.clang_CXXMethod_isStatic(self)
+ return conf.lib.clang_CXXMethod_isStatic(self) # type: ignore [no-any-return]
def is_virtual_method(self):
"""Returns True if the cursor refers to a C++ member function or member
function template that is declared 'virtual'.
"""
- return conf.lib.clang_CXXMethod_isVirtual(self)
+ return conf.lib.clang_CXXMethod_isVirtual(self) # type: ignore [no-any-return]
def is_abstract_record(self):
"""Returns True if the cursor refers to a C++ record declaration
that has pure virtual member functions.
"""
- return conf.lib.clang_CXXRecord_isAbstract(self)
+ return conf.lib.clang_CXXRecord_isAbstract(self) # type: ignore [no-any-return]
def is_scoped_enum(self):
"""Returns True if the cursor refers to a scoped enum declaration."""
- return conf.lib.clang_EnumDecl_isScoped(self)
+ return conf.lib.clang_EnumDecl_isScoped(self) # type: ignore [no-any-return]
def get_definition(self):
"""
@@ -1707,7 +1747,7 @@ class Cursor(Structure):
"""
# TODO: Should probably check that this is either a reference or
# declaration prior to issuing the lookup.
- return conf.lib.clang_getCursorDefinition(self)
+ return conf.lib.clang_getCursorDefinition(self) # type: ignore [no-any-return]
def get_usr(self):
"""Return the Unified Symbol Resolution (USR) for the entity referenced
@@ -1718,13 +1758,13 @@ class Cursor(Structure):
program. USRs can be compared across translation units to determine,
e.g., when references in one translation refer to an entity defined in
another translation unit."""
- return conf.lib.clang_getCursorUSR(self)
+ return conf.lib.clang_getCursorUSR(self) # type: ignore [no-any-return]
def get_included_file(self):
"""Returns the File that is included by the current inclusion cursor."""
assert self.kind == CursorKind.INCLUSION_DIRECTIVE
- return conf.lib.clang_getIncludedFile(self)
+ return conf.lib.clang_getIncludedFile(self) # type: ignore [no-any-return]
@property
def kind(self):
@@ -1994,12 +2034,12 @@ class Cursor(Structure):
@property
def brief_comment(self):
"""Returns the brief comment text associated with that Cursor"""
- return conf.lib.clang_Cursor_getBriefCommentText(self)
+ return conf.lib.clang_Cursor_getBriefCommentText(self) # type: ignore [no-any-return]
@property
def raw_comment(self):
"""Returns the raw comment text associated with that Cursor"""
- return conf.lib.clang_Cursor_getRawCommentText(self)
+ return conf.lib.clang_Cursor_getRawCommentText(self) # type: ignore [no-any-return]
def get_arguments(self):
"""Return an iterator for accessing the arguments of this cursor."""
@@ -2009,24 +2049,24 @@ class Cursor(Structure):
def get_num_template_arguments(self):
"""Returns the number of template args associated with this cursor."""
- return conf.lib.clang_Cursor_getNumTemplateArguments(self)
+ return conf.lib.clang_Cursor_getNumTemplateArguments(self) # type: ignore [no-any-return]
def get_template_argument_kind(self, num):
"""Returns the TemplateArgumentKind for the indicated template
argument."""
- return conf.lib.clang_Cursor_getTemplateArgumentKind(self, num)
+ return conf.lib.clang_Cursor_getTemplateArgumentKind(self, num) # type: ignore [no-any-return]
def get_template_argument_type(self, num):
"""Returns the CXType for the indicated template argument."""
- return conf.lib.clang_Cursor_getTemplateArgumentType(self, num)
+ return conf.lib.clang_Cursor_getTemplateArgumentType(self, num) # type: ignore [no-any-return]
def get_template_argument_value(self, num):
"""Returns the value of the indicated arg as a signed 64b integer."""
- return conf.lib.clang_Cursor_getTemplateArgumentValue(self, num)
+ return conf.lib.clang_Cursor_getTemplateArgumentValue(self, num) # type: ignore [no-any-return]
def get_template_argument_unsigned_value(self, num):
"""Returns the value of the indicated arg as an unsigned 64b integer."""
- return conf.lib.clang_Cursor_getTemplateArgumentUnsignedValue(self, num)
+ return conf.lib.clang_Cursor_getTemplateArgumentUnsignedValue(self, num) # type: ignore [no-any-return]
def get_children(self):
"""Return an iterator for accessing the children of this cursor."""
@@ -2042,8 +2082,8 @@ class Cursor(Structure):
children.append(child)
return 1 # continue
- children = []
- conf.lib.clang_visitChildren(self, callbacks["cursor_visit"](visitor), children)
+ children: list[Cursor] = []
+ conf.lib.clang_visitChildren(self, cursor_visit_callback(visitor), children)
return iter(children)
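For orientation, a minimal usage sketch of the visitor-driven traversal above (file name and contents are hypothetical; a working libclang install is assumed):

    from clang.cindex import Index

    tu = Index.create().parse(
        "walk.c", unsaved_files=[("walk.c", "struct S { int x; }; int g(struct S s);")]
    )
    # walk_preorder() is built on get_children(), which feeds the typed
    # `children: list[Cursor]` accumulator through cursor_visit_callback.
    for cursor in tu.cursor.walk_preorder():
        print(cursor.kind, cursor.spelling)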
def walk_preorder(self):
@@ -2066,7 +2106,7 @@ class Cursor(Structure):
def get_field_offsetof(self):
"""Returns the offsetof the FIELD_DECL pointed by this Cursor."""
- return conf.lib.clang_Cursor_getOffsetOfField(self)
+ return conf.lib.clang_Cursor_getOffsetOfField(self) # type: ignore [no-any-return]
def is_anonymous(self):
"""
@@ -2074,19 +2114,19 @@ class Cursor(Structure):
"""
if self.kind == CursorKind.FIELD_DECL:
return self.type.get_declaration().is_anonymous()
- return conf.lib.clang_Cursor_isAnonymous(self)
+ return conf.lib.clang_Cursor_isAnonymous(self) # type: ignore [no-any-return]
def is_bitfield(self):
"""
Check if the field is a bitfield.
"""
- return conf.lib.clang_Cursor_isBitField(self)
+ return conf.lib.clang_Cursor_isBitField(self) # type: ignore [no-any-return]
def get_bitfield_width(self):
"""
Retrieve the width of a bitfield.
"""
- return conf.lib.clang_getFieldDeclBitWidth(self)
+ return conf.lib.clang_getFieldDeclBitWidth(self) # type: ignore [no-any-return]
@staticmethod
def from_result(res, fn, args):
@@ -2223,7 +2263,7 @@ class TypeKind(BaseEnumeration):
@property
def spelling(self):
"""Retrieve the spelling of this TypeKind."""
- return conf.lib.clang_getTypeKindSpelling(self.value)
+ return conf.lib.clang_getTypeKindSpelling(self.value) # type: ignore [no-any-return]
INVALID = 0
UNEXPOSED = 1
@@ -2379,25 +2419,25 @@ class Type(Structure):
"""Return the kind of this type."""
return TypeKind.from_id(self._kind_id)
- def argument_types(self):
+ def argument_types(self) -> NoSliceSequence[Type]:
"""Retrieve a container for the non-variadic arguments for this type.
The returned object is iterable and indexable. Each item in the
container is a Type instance.
"""
- class ArgumentsIterator(collections.abc.Sequence):
- def __init__(self, parent):
+ class ArgumentsIterator:
+ def __init__(self, parent: Type):
self.parent = parent
- self.length = None
+ self.length: int | None = None
- def __len__(self):
+ def __len__(self) -> int:
if self.length is None:
self.length = conf.lib.clang_getNumArgTypes(self.parent)
return self.length
- def __getitem__(self, key):
+ def __getitem__(self, key: int) -> Type:
# FIXME Support slice objects.
if not isinstance(key, int):
raise TypeError("Must supply a non-negative int.")
@@ -2411,7 +2451,7 @@ class Type(Structure):
"%d > %d" % (key, len(self))
)
- result = conf.lib.clang_getArgType(self.parent, key)
+ result: Type = conf.lib.clang_getArgType(self.parent, key)
if result.kind == TypeKind.INVALID:
raise IndexError("Argument could not be retrieved.")
@@ -2470,10 +2510,10 @@ class Type(Structure):
return res
def get_num_template_arguments(self):
- return conf.lib.clang_Type_getNumTemplateArguments(self)
+ return conf.lib.clang_Type_getNumTemplateArguments(self) # type: ignore [no-any-return]
def get_template_argument_type(self, num):
- return conf.lib.clang_Type_getTemplateArgumentAsType(self, num)
+ return conf.lib.clang_Type_getTemplateArgumentAsType(self, num) # type: ignore [no-any-return]
def get_canonical(self):
"""
@@ -2485,7 +2525,7 @@ class Type(Structure):
example, if 'T' is a typedef for 'int', the canonical type for
'T' would be 'int'.
"""
- return conf.lib.clang_getCanonicalType(self)
+ return conf.lib.clang_getCanonicalType(self) # type: ignore [no-any-return]
def is_const_qualified(self):
"""Determine whether a Type has the "const" qualifier set.
@@ -2493,7 +2533,7 @@ class Type(Structure):
This does not look through typedefs that may have added "const"
at a different level.
"""
- return conf.lib.clang_isConstQualifiedType(self)
+ return conf.lib.clang_isConstQualifiedType(self) # type: ignore [no-any-return]
def is_volatile_qualified(self):
"""Determine whether a Type has the "volatile" qualifier set.
@@ -2501,7 +2541,7 @@ class Type(Structure):
This does not look through typedefs that may have added "volatile"
at a different level.
"""
- return conf.lib.clang_isVolatileQualifiedType(self)
+ return conf.lib.clang_isVolatileQualifiedType(self) # type: ignore [no-any-return]
def is_restrict_qualified(self):
"""Determine whether a Type has the "restrict" qualifier set.
@@ -2509,83 +2549,83 @@ class Type(Structure):
This does not look through typedefs that may have added "restrict" at
a different level.
"""
- return conf.lib.clang_isRestrictQualifiedType(self)
+ return conf.lib.clang_isRestrictQualifiedType(self) # type: ignore [no-any-return]
def is_function_variadic(self):
"""Determine whether this function Type is a variadic function type."""
assert self.kind == TypeKind.FUNCTIONPROTO
- return conf.lib.clang_isFunctionTypeVariadic(self)
+ return conf.lib.clang_isFunctionTypeVariadic(self) # type: ignore [no-any-return]
def get_address_space(self):
- return conf.lib.clang_getAddressSpace(self)
+ return conf.lib.clang_getAddressSpace(self) # type: ignore [no-any-return]
def get_typedef_name(self):
- return conf.lib.clang_getTypedefName(self)
+ return conf.lib.clang_getTypedefName(self) # type: ignore [no-any-return]
def is_pod(self):
"""Determine whether this Type represents plain old data (POD)."""
- return conf.lib.clang_isPODType(self)
+ return conf.lib.clang_isPODType(self) # type: ignore [no-any-return]
def get_pointee(self):
"""
For pointer types, returns the type of the pointee.
"""
- return conf.lib.clang_getPointeeType(self)
+ return conf.lib.clang_getPointeeType(self) # type: ignore [no-any-return]
def get_declaration(self):
"""
Return the cursor for the declaration of the given type.
"""
- return conf.lib.clang_getTypeDeclaration(self)
+ return conf.lib.clang_getTypeDeclaration(self) # type: ignore [no-any-return]
def get_result(self):
"""
Retrieve the result type associated with a function type.
"""
- return conf.lib.clang_getResultType(self)
+ return conf.lib.clang_getResultType(self) # type: ignore [no-any-return]
def get_array_element_type(self):
"""
Retrieve the type of the elements of the array type.
"""
- return conf.lib.clang_getArrayElementType(self)
+ return conf.lib.clang_getArrayElementType(self) # type: ignore [no-any-return]
def get_array_size(self):
"""
Retrieve the size of the constant array.
"""
- return conf.lib.clang_getArraySize(self)
+ return conf.lib.clang_getArraySize(self) # type: ignore [no-any-return]
def get_class_type(self):
"""
Retrieve the class type of the member pointer type.
"""
- return conf.lib.clang_Type_getClassType(self)
+ return conf.lib.clang_Type_getClassType(self) # type: ignore [no-any-return]
def get_named_type(self):
"""
Retrieve the type named by the qualified-id.
"""
- return conf.lib.clang_Type_getNamedType(self)
+ return conf.lib.clang_Type_getNamedType(self) # type: ignore [no-any-return]
def get_align(self):
"""
Retrieve the alignment of the record.
"""
- return conf.lib.clang_Type_getAlignOf(self)
+ return conf.lib.clang_Type_getAlignOf(self) # type: ignore [no-any-return]
def get_size(self):
"""
Retrieve the size of the record.
"""
- return conf.lib.clang_Type_getSizeOf(self)
+ return conf.lib.clang_Type_getSizeOf(self) # type: ignore [no-any-return]
def get_offset(self, fieldname):
"""
Retrieve the offset of a field in the record.
"""
- return conf.lib.clang_Type_getOffsetOf(self, fieldname)
+ return conf.lib.clang_Type_getOffsetOf(self, fieldname) # type: ignore [no-any-return]
def get_ref_qualifier(self):
"""
@@ -2604,10 +2644,8 @@ class Type(Structure):
fields.append(field)
return 1 # continue
- fields = []
- conf.lib.clang_Type_visitFields(
- self, callbacks["fields_visit"](visitor), fields
- )
+ fields: list[Cursor] = []
+ conf.lib.clang_Type_visitFields(self, fields_visit_callback(visitor), fields)
return iter(fields)
def get_exception_specification_kind(self):
@@ -2622,13 +2660,13 @@ class Type(Structure):
@property
def spelling(self):
"""Retrieve the spelling of this Type."""
- return conf.lib.clang_getTypeSpelling(self)
+ return conf.lib.clang_getTypeSpelling(self) # type: ignore [no-any-return]
def __eq__(self, other):
if type(other) != type(self):
return False
- return conf.lib.clang_equalTypes(self, other)
+ return conf.lib.clang_equalTypes(self, other) # type: ignore [no-any-return]
def __ne__(self, other):
return not self.__eq__(other)
@@ -2712,7 +2750,7 @@ class CompletionChunk:
def spelling(self):
if self.__kindNumber in SpellingCache:
return SpellingCache[self.__kindNumber]
- return conf.lib.clang_getCompletionChunkText(self.cs, self.key)
+ return conf.lib.clang_getCompletionChunkText(self.cs, self.key) # type: ignore [no-any-return]
# We do not use @CachedProperty here, as the manual implementation is
# apparently still significantly faster. Please profile carefully if you
@@ -2795,7 +2833,7 @@ class CompletionString(ClangObject):
@CachedProperty
def num_chunks(self):
- return conf.lib.clang_getNumCompletionChunks(self.obj)
+ return conf.lib.clang_getNumCompletionChunks(self.obj) # type: ignore [no-any-return]
def __getitem__(self, key):
if self.num_chunks <= key:
@@ -2804,7 +2842,7 @@ class CompletionString(ClangObject):
@property
def priority(self):
- return conf.lib.clang_getCompletionPriority(self.obj)
+ return conf.lib.clang_getCompletionPriority(self.obj) # type: ignore [no-any-return]
@property
def availability(self):
@@ -2814,7 +2852,7 @@ class CompletionString(ClangObject):
@property
def briefComment(self):
if conf.function_exists("clang_getCompletionBriefComment"):
- return conf.lib.clang_getCompletionBriefComment(self.obj)
+ return conf.lib.clang_getCompletionBriefComment(self.obj) # type: ignore [no-any-return]
return _CXString()
def __repr__(self):
@@ -2881,16 +2919,16 @@ class CodeCompletionResults(ClangObject):
return self.ptr.contents
@property
- def diagnostics(self):
+ def diagnostics(self) -> NoSliceSequence[Diagnostic]:
class DiagnosticsItr:
- def __init__(self, ccr):
+ def __init__(self, ccr: CodeCompletionResults):
self.ccr = ccr
- def __len__(self):
+ def __len__(self) -> int:
return int(conf.lib.clang_codeCompleteGetNumDiagnostics(self.ccr))
- def __getitem__(self, key):
- return conf.lib.clang_codeCompleteGetDiagnostic(self.ccr, key)
+ def __getitem__(self, key: int) -> Diagnostic:
+ return conf.lib.clang_codeCompleteGetDiagnostic(self.ccr, key) # type: ignore [no-any-return]
return DiagnosticsItr(self)
@@ -2973,6 +3011,20 @@ class TranslationUnit(ClangObject):
# into the set of code completions returned from this translation unit.
PARSE_INCLUDE_BRIEF_COMMENTS_IN_CODE_COMPLETION = 128
+ @staticmethod
+ def process_unsaved_files(unsaved_files) -> Array[_CXUnsavedFile] | None:
+ unsaved_array = None
+ if len(unsaved_files):
+ unsaved_array = (_CXUnsavedFile * len(unsaved_files))()
+ for i, (name, contents) in enumerate(unsaved_files):
+ if hasattr(contents, "read"):
+ contents = contents.read()
+ binary_contents = b(contents)
+ unsaved_array[i].name = b(os.fspath(name))
+ unsaved_array[i].contents = binary_contents
+ unsaved_array[i].length = len(binary_contents)
+ return unsaved_array
+
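A minimal caller-side sketch of the behavior the new `process_unsaved_files` helper centralizes (names are hypothetical; nothing needs to exist on disk):

    from clang.cindex import TranslationUnit

    tu = TranslationUnit.from_source(
        "virtual.c",
        unsaved_files=[("virtual.c", "int answer(void) { return 42; }")],
    )
    print(tu.spelling)  # virtual.c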
@classmethod
def from_source(
cls, filename, args=None, unsaved_files=None, options=0, index=None
@@ -3029,16 +3081,7 @@ class TranslationUnit(ClangObject):
if len(args) > 0:
args_array = (c_char_p * len(args))(*[b(x) for x in args])
- unsaved_array = None
- if len(unsaved_files) > 0:
- unsaved_array = (_CXUnsavedFile * len(unsaved_files))()
- for i, (name, contents) in enumerate(unsaved_files):
- if hasattr(contents, "read"):
- contents = contents.read()
- contents = b(contents)
- unsaved_array[i].name = b(os.fspath(name))
- unsaved_array[i].contents = contents
- unsaved_array[i].length = len(contents)
+ unsaved_array = cls.process_unsaved_files(unsaved_files)
ptr = conf.lib.clang_parseTranslationUnit(
index,
@@ -3095,12 +3138,12 @@ class TranslationUnit(ClangObject):
@property
def cursor(self):
"""Retrieve the cursor that represents the given translation unit."""
- return conf.lib.clang_getTranslationUnitCursor(self)
+ return conf.lib.clang_getTranslationUnitCursor(self) # type: ignore [no-any-return]
@property
def spelling(self):
"""Get the original translation unit source file name."""
- return conf.lib.clang_getTranslationUnitSpelling(self)
+ return conf.lib.clang_getTranslationUnitSpelling(self) # type: ignore [no-any-return]
def get_includes(self):
"""
@@ -3119,7 +3162,7 @@ class TranslationUnit(ClangObject):
# Automatically adapt CIndex/ctype pointers to python objects
includes = []
conf.lib.clang_getInclusions(
- self, callbacks["translation_unit_includes"](visitor), includes
+ self, translation_unit_includes_callback(visitor), includes
)
return iter(includes)
@@ -3187,19 +3230,19 @@ class TranslationUnit(ClangObject):
return SourceRange.from_locations(start_location, end_location)
@property
- def diagnostics(self):
+ def diagnostics(self) -> NoSliceSequence[Diagnostic]:
"""
Return an iterable (and indexable) object containing the diagnostics.
"""
class DiagIterator:
- def __init__(self, tu):
+ def __init__(self, tu: TranslationUnit):
self.tu = tu
- def __len__(self):
+ def __len__(self) -> int:
return int(conf.lib.clang_getNumDiagnostics(self.tu))
- def __getitem__(self, key):
+ def __getitem__(self, key: int) -> Diagnostic:
diag = conf.lib.clang_getDiagnostic(self.tu, key)
if not diag:
raise IndexError
@@ -3219,16 +3262,7 @@ class TranslationUnit(ClangObject):
if unsaved_files is None:
unsaved_files = []
- unsaved_files_array = 0
- if len(unsaved_files):
- unsaved_files_array = (_CXUnsavedFile * len(unsaved_files))()
- for i, (name, contents) in enumerate(unsaved_files):
- if hasattr(contents, "read"):
- contents = contents.read()
- contents = b(contents)
- unsaved_files_array[i].name = b(os.fspath(name))
- unsaved_files_array[i].contents = contents
- unsaved_files_array[i].length = len(contents)
+ unsaved_files_array = self.process_unsaved_files(unsaved_files)
ptr = conf.lib.clang_reparseTranslationUnit(
self, len(unsaved_files), unsaved_files_array, options
)
@@ -3291,16 +3325,7 @@ class TranslationUnit(ClangObject):
if unsaved_files is None:
unsaved_files = []
- unsaved_files_array = 0
- if len(unsaved_files):
- unsaved_files_array = (_CXUnsavedFile * len(unsaved_files))()
- for i, (name, contents) in enumerate(unsaved_files):
- if hasattr(contents, "read"):
- contents = contents.read()
- contents = b(contents)
- unsaved_files_array[i].name = b(os.fspath(name))
- unsaved_files_array[i].contents = contents
- unsaved_files_array[i].length = len(contents)
+ unsaved_files_array = self.process_unsaved_files(unsaved_files)
ptr = conf.lib.clang_codeCompleteAt(
self,
os.fspath(path),
@@ -3344,12 +3369,12 @@ class File(ClangObject):
@property
def name(self):
"""Return the complete file and path name of the file."""
- return conf.lib.clang_getFileName(self)
+ return conf.lib.clang_getFileName(self) # type: ignore [no-any-return]
@property
def time(self):
"""Return the last modification time of the file."""
- return conf.lib.clang_getFileTime(self)
+ return conf.lib.clang_getFileTime(self) # type: ignore [no-any-return]
def __str__(self):
return self.name
@@ -3428,12 +3453,12 @@ class CompileCommand:
@property
def directory(self):
"""Get the working directory for this CompileCommand"""
- return conf.lib.clang_CompileCommand_getDirectory(self.cmd)
+ return conf.lib.clang_CompileCommand_getDirectory(self.cmd) # type: ignore [no-any-return]
@property
def filename(self):
"""Get the working filename for this CompileCommand"""
- return conf.lib.clang_CompileCommand_getFilename(self.cmd)
+ return conf.lib.clang_CompileCommand_getFilename(self.cmd) # type: ignore [no-any-return]
@property
def arguments(self):
@@ -3512,7 +3537,7 @@ class CompilationDatabase(ClangObject):
Get an iterable object providing all the CompileCommands available to
build filename. Returns None if filename is not found in the database.
"""
- return conf.lib.clang_CompilationDatabase_getCompileCommands(
+ return conf.lib.clang_CompilationDatabase_getCompileCommands( # type: ignore [no-any-return]
self, os.fspath(filename)
)
@@ -3521,7 +3546,7 @@ class CompilationDatabase(ClangObject):
Get an iterable object providing all the CompileCommands available from
the database.
"""
- return conf.lib.clang_CompilationDatabase_getAllCompileCommands(self)
+ return conf.lib.clang_CompilationDatabase_getAllCompileCommands(self) # type: ignore [no-any-return]
class Token(Structure):
@@ -3542,7 +3567,7 @@ class Token(Structure):
This is the textual representation of the token in source.
"""
- return conf.lib.clang_getTokenSpelling(self._tu, self)
+ return conf.lib.clang_getTokenSpelling(self._tu, self) # type: ignore [no-any-return]
@property
def kind(self):
@@ -3552,12 +3577,12 @@ class Token(Structure):
@property
def location(self):
"""The SourceLocation this Token occurs at."""
- return conf.lib.clang_getTokenLocation(self._tu, self)
+ return conf.lib.clang_getTokenLocation(self._tu, self) # type: ignore [no-any-return]
@property
def extent(self):
"""The SourceRange this Token occupies."""
- return conf.lib.clang_getTokenExtent(self._tu, self)
+ return conf.lib.clang_getTokenExtent(self._tu, self) # type: ignore [no-any-return]
@property
def cursor(self):
@@ -3619,7 +3644,7 @@ class Rewriter(ClangObject):
Returns 1 if any files were not saved successfully,
returns 0 otherwise.
"""
- return conf.lib.clang_CXRewriter_overwriteChangedFiles(self)
+ return conf.lib.clang_CXRewriter_overwriteChangedFiles(self) # type: ignore [no-any-return]
def write_main_file_to_stdout(self):
"""
@@ -3631,15 +3656,15 @@ class Rewriter(ClangObject):
# Now comes the plumbing to hook up the C library.
-# Register callback types in common container.
-callbacks["translation_unit_includes"] = CFUNCTYPE(
+# Register callback types
+translation_unit_includes_callback = CFUNCTYPE(
None, c_object_p, POINTER(SourceLocation), c_uint, py_object
)
-callbacks["cursor_visit"] = CFUNCTYPE(c_int, Cursor, Cursor, py_object)
-callbacks["fields_visit"] = CFUNCTYPE(c_int, Cursor, py_object)
+cursor_visit_callback = CFUNCTYPE(c_int, Cursor, Cursor, py_object)
+fields_visit_callback = CFUNCTYPE(c_int, Cursor, py_object)
# Functions strictly alphabetical order.
-functionList = [
+functionList: list[LibFunc] = [
(
"clang_annotateTokens",
[TranslationUnit, POINTER(Token), c_uint, POINTER(Cursor)],
@@ -3809,7 +3834,7 @@ functionList = [
("clang_getIncludedFile", [Cursor], c_object_p, File.from_result),
(
"clang_getInclusions",
- [TranslationUnit, callbacks["translation_unit_includes"], py_object],
+ [TranslationUnit, translation_unit_includes_callback, py_object],
),
(
"clang_getInstantiationLocation",
@@ -3894,7 +3919,7 @@ functionList = [
"clang_tokenize",
[TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)],
),
- ("clang_visitChildren", [Cursor, callbacks["cursor_visit"], py_object], c_uint),
+ ("clang_visitChildren", [Cursor, cursor_visit_callback, py_object], c_uint),
("clang_Cursor_getNumArguments", [Cursor], c_int),
("clang_Cursor_getArgument", [Cursor, c_uint], Cursor, Cursor.from_result),
("clang_Cursor_getNumTemplateArguments", [Cursor], c_int),
@@ -3921,19 +3946,19 @@ functionList = [
("clang_Type_getSizeOf", [Type], c_longlong),
("clang_Type_getCXXRefQualifier", [Type], c_uint),
("clang_Type_getNamedType", [Type], Type, Type.from_result),
- ("clang_Type_visitFields", [Type, callbacks["fields_visit"], py_object], c_uint),
+ ("clang_Type_visitFields", [Type, fields_visit_callback, py_object], c_uint),
]
class LibclangError(Exception):
- def __init__(self, message):
+ def __init__(self, message: str):
self.m = message
- def __str__(self):
+ def __str__(self) -> str:
return self.m
-def register_function(lib, item, ignore_errors):
+def register_function(lib: CDLL, item: LibFunc, ignore_errors: bool) -> None:
# A function may not exist, if these bindings are used with an older or
# incompatible version of libclang.so.
try:
@@ -3957,15 +3982,15 @@ def register_function(lib, item, ignore_errors):
func.errcheck = item[3]
-def register_functions(lib, ignore_errors):
+def register_functions(lib: CDLL, ignore_errors: bool) -> None:
"""Register function prototypes with a libclang library instance.
This must be called as part of library instantiation so Python knows how
to call out to the shared library.
"""
- def register(item):
- return register_function(lib, item, ignore_errors)
+ def register(item: LibFunc) -> None:
+ register_function(lib, item, ignore_errors)
for f in functionList:
register(f)
@@ -3973,12 +3998,12 @@ def register_functions(lib, ignore_errors):
class Config:
library_path = None
- library_file = None
+ library_file: str | None = None
compatibility_check = True
loaded = False
@staticmethod
- def set_library_path(path):
+ def set_library_path(path: StrPath) -> None:
"""Set the path in which to search for libclang"""
if Config.loaded:
raise Exception(
@@ -3989,7 +4014,7 @@ class Config:
Config.library_path = os.fspath(path)
@staticmethod
- def set_library_file(filename):
+ def set_library_file(filename: StrPath) -> None:
"""Set the exact location of libclang"""
if Config.loaded:
raise Exception(
@@ -4000,7 +4025,7 @@ class Config:
Config.library_file = os.fspath(filename)
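Usage sketch for the now-annotated configuration entry points; the path below is purely hypothetical, must point at the local libclang build, and has to be set before anything touches `conf.lib`:

    from clang import cindex

    # Hypothetical location; adjust for the local toolchain.
    cindex.Config.set_library_file("/usr/lib/llvm/lib/libclang.so")
    index = cindex.Index.create()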
@staticmethod
- def set_compatibility_check(check_status):
+ def set_compatibility_check(check_status: bool) -> None:
"""Perform compatibility check when loading libclang
The python bindings are only tested and evaluated with the version of
@@ -4026,13 +4051,13 @@ class Config:
Config.compatibility_check = check_status
@CachedProperty
- def lib(self):
+ def lib(self) -> CDLL:
lib = self.get_cindex_library()
register_functions(lib, not Config.compatibility_check)
Config.loaded = True
return lib
- def get_filename(self):
+ def get_filename(self) -> str:
if Config.library_file:
return Config.library_file
@@ -4052,7 +4077,7 @@ class Config:
return file
- def get_cindex_library(self):
+ def get_cindex_library(self) -> CDLL:
try:
library = cdll.LoadLibrary(self.get_filename())
except OSError as e:
@@ -4065,7 +4090,7 @@ class Config:
return library
- def function_exists(self, name):
+ def function_exists(self, name: str) -> bool:
try:
getattr(self.lib, name)
except AttributeError:
@@ -4077,6 +4102,7 @@ class Config:
conf = Config()
__all__ = [
+ "AccessSpecifier",
"AvailabilityKind",
"BinaryOperator",
"Config",
@@ -4087,12 +4113,16 @@ __all__ = [
"CursorKind",
"Cursor",
"Diagnostic",
+ "ExceptionSpecificationKind",
"File",
"FixIt",
"Index",
"LinkageKind",
+ "RefQualifierKind",
"SourceLocation",
"SourceRange",
+ "StorageClass",
+ "TemplateArgumentKind",
"TLSKind",
"TokenKind",
"Token",
diff --git a/clang/bindings/python/tests/cindex/test_code_completion.py b/clang/bindings/python/tests/cindex/test_code_completion.py
index ca52fc6..1d513db 100644
--- a/clang/bindings/python/tests/cindex/test_code_completion.py
+++ b/clang/bindings/python/tests/cindex/test_code_completion.py
@@ -53,7 +53,7 @@ void f() {
expected = [
"{'int', ResultType} | {'test1', TypedText} || Priority: 50 || Availability: Available || Brief comment: Aaa.",
"{'void', ResultType} | {'test2', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 50 || Availability: Available || Brief comment: Bbb.",
- "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: None",
+ "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: ",
]
self.check_completion_results(cr, expected)
@@ -94,7 +94,7 @@ void f() {
expected = [
"{'int', ResultType} | {'test1', TypedText} || Priority: 50 || Availability: Available || Brief comment: Aaa.",
"{'void', ResultType} | {'test2', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 50 || Availability: Available || Brief comment: Bbb.",
- "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: None",
+ "{'return', TypedText} | {';', SemiColon} || Priority: 40 || Availability: Available || Brief comment: ",
]
self.check_completion_results(cr, expected)
@@ -128,19 +128,19 @@ void f(P x, Q y) {
cr = tu.codeComplete("fake.cpp", 12, 5, unsaved_files=files)
expected = [
- "{'const', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
- "{'volatile', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
- "{'operator', TypedText} || Priority: 40 || Availability: Available || Brief comment: None",
- "{'P', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
- "{'Q', TypedText} || Priority: 50 || Availability: Available || Brief comment: None",
+ "{'const', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
+ "{'volatile', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
+ "{'operator', TypedText} || Priority: 40 || Availability: Available || Brief comment: ",
+ "{'P', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
+ "{'Q', TypedText} || Priority: 50 || Availability: Available || Brief comment: ",
]
self.check_completion_results(cr, expected)
cr = tu.codeComplete("fake.cpp", 13, 5, unsaved_files=files)
expected = [
- "{'P', TypedText} | {'::', Text} || Priority: 75 || Availability: Available || Brief comment: None",
- "{'P &', ResultType} | {'operator=', TypedText} | {'(', LeftParen} | {'const P &', Placeholder} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: None",
- "{'int', ResultType} | {'member', TypedText} || Priority: 35 || Availability: NotAccessible || Brief comment: None",
- "{'void', ResultType} | {'~P', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: None",
+ "{'P', TypedText} | {'::', Text} || Priority: 75 || Availability: Available || Brief comment: ",
+ "{'P &', ResultType} | {'operator=', TypedText} | {'(', LeftParen} | {'const P &', Placeholder} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: ",
+ "{'int', ResultType} | {'member', TypedText} || Priority: 35 || Availability: NotAccessible || Brief comment: ",
+ "{'void', ResultType} | {'~P', TypedText} | {'(', LeftParen} | {')', RightParen} || Priority: 79 || Availability: Available || Brief comment: ",
]
self.check_completion_results(cr, expected)
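A hedged sketch of how expectations like these are produced (the file is in-memory and hypothetical, and the exact completions depend on the libclang version); the point is that an absent brief comment now renders as the empty string:

    from clang.cindex import TranslationUnit

    code = "int test1();\nvoid f() {  }\n"
    tu = TranslationUnit.from_source(
        "fake.c",
        unsaved_files=[("fake.c", code)],
        options=TranslationUnit.PARSE_INCLUDE_BRIEF_COMMENTS_IN_CODE_COMPLETION,
    )
    cr = tu.codeComplete("fake.c", 2, 12, unsaved_files=[("fake.c", code)])
    for result in cr.results:
        # Each repr ends with "|| Brief comment: <text>"; a missing comment is
        # now shown as "" instead of None.
        print(result.string)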
diff --git a/clang/bindings/python/tests/cindex/test_comment.py b/clang/bindings/python/tests/cindex/test_comment.py
index 0727c6f..265c6d3 100644
--- a/clang/bindings/python/tests/cindex/test_comment.py
+++ b/clang/bindings/python/tests/cindex/test_comment.py
@@ -53,5 +53,5 @@ void f() {
f = get_cursor(tu, "f")
raw = f.raw_comment
brief = f.brief_comment
- self.assertIsNone(raw)
- self.assertIsNone(brief)
+ self.assertEqual(raw, "")
+ self.assertEqual(brief, "")
diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index 9e6feb4..e5161dd 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -29,9 +29,13 @@ endfunction()
# cache file to CMake via -C. e.g.
#
# cmake -D LLVM_RELEASE_ENABLE_PGO=ON -C Release.cmake
+set (DEFAULT_RUNTIMES "compiler-rt;libcxx")
+if (NOT WIN32)
+ list(APPEND DEFAULT_RUNTIMES "libcxxabi" "libunwind")
+endif()
set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "")
set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "")
-set(LLVM_RELEASE_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
+set(LLVM_RELEASE_ENABLE_RUNTIMES ${DEFAULT_RUNTIMES} CACHE STRING "")
set(LLVM_RELEASE_ENABLE_PROJECTS "clang;lld;lldb;clang-tools-extra;bolt;polly;mlir;flang" CACHE STRING "")
# Note we don't need to add install here, since it is one of the pre-defined
# steps.
diff --git a/clang/cmake/modules/AddClang.cmake b/clang/cmake/modules/AddClang.cmake
index 9d09be1..5327b5d 100644
--- a/clang/cmake/modules/AddClang.cmake
+++ b/clang/cmake/modules/AddClang.cmake
@@ -147,6 +147,7 @@ endmacro(add_clang_library)
macro(add_clang_executable name)
add_llvm_executable( ${name} ${ARGN} )
set_clang_windows_version_resource_properties(${name})
+ set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
endmacro(add_clang_executable)
macro(add_clang_tool name)
@@ -181,6 +182,7 @@ macro(add_clang_tool name)
set_property(GLOBAL APPEND PROPERTY CLANG_EXPORTS ${name})
endif()
endif()
+ set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
endmacro()
macro(add_clang_symlink name dest)
diff --git a/clang/docs/MemorySanitizer.rst b/clang/docs/MemorySanitizer.rst
index bcc6cc8..05e43a3 100644
--- a/clang/docs/MemorySanitizer.rst
+++ b/clang/docs/MemorySanitizer.rst
@@ -8,11 +8,18 @@ MemorySanitizer
Introduction
============
-MemorySanitizer is a detector of uninitialized reads. It consists of a
+MemorySanitizer is a detector of uninitialized memory use. It consists of a
compiler instrumentation module and a run-time library.
Typical slowdown introduced by MemorySanitizer is **3x**.
+Here is a non-comprehensive list of cases when MemorySanitizer will report an error:
+
+* Uninitialized value was used in a conditional branch.
+* Uninitialized pointer was used for memory accesses.
+* Uninitialized value was passed to or returned from a function call, which is considered undefined behavior. The check can be disabled with ``-fno-sanitize-memory-param-retval``.
+* Uninitialized data was passed into some libc calls.
+
How to build
============
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 286f319..3c2e028 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -54,6 +54,11 @@ Clang Frontend Potentially Breaking Changes
Clang Python Bindings Potentially Breaking Changes
--------------------------------------------------
+- Parts of the interface returning string results will now return
+ the empty string `""` when no result is available, instead of `None`.
+- Calling a property on the `CompletionChunk` or `CompletionString` class
+ statically now leads to an error, instead of returning a `CachedProperty` object
+ that is used internally. Properties are only available on instances.
What's New in Clang |release|?
==============================
@@ -116,6 +121,9 @@ Attribute Changes in Clang
- Introduced a new format attribute ``__attribute__((format(syslog, 1, 2)))`` from OpenBSD.
+- The ``hybrid_patchable`` attribute is now supported on ARM64EC targets. It can be used to specify
+ that a function requires an additional x86-64 thunk, which may be patched at runtime.
+
Improvements to Clang's diagnostics
-----------------------------------
@@ -129,10 +137,14 @@ Improvements to Clang's diagnostics
template <typename> int i; // error: non-static data member 'i' cannot be declared as a template
};
+- Clang now has improved diagnostics for functions with explicit 'this' parameters. Fixes #GH97878
+
- Clang now diagnoses dangling references to fields of temporary objects. Fixes #GH81589.
- Clang now diagnoses undefined behavior in constant expressions more consistently. This includes invalid shifts, and signed overflow in arithmetic.
+- ``-Wdangling-assignment-gsl`` is enabled by default.
+
Improvements to Clang's time-trace
----------------------------------
@@ -156,6 +168,8 @@ Bug Fixes to C++ Support
- Fixed a crash when an expression with a dependent ``__typeof__`` type is used as the operand of a unary operator. (#GH97646)
- Fixed a failed assertion when checking invalid delete operator declaration. (#GH96191)
+- Fixed a crash when checking a destructor reference with an invalid initializer. (#GH97230)
+- Clang now correctly parses potentially declarative nested-name-specifiers in pointer-to-member declarators.
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -242,6 +256,9 @@ Fixed Point Support in Clang
AST Matchers
------------
+- Fixed an issue with the `hasName` and `hasAnyName` matchers when matching
+ inline namespaces with an enclosing namespace of the same name.
+
clang-format
------------
@@ -265,6 +282,10 @@ Crash and bug fixes
Improvements
^^^^^^^^^^^^
+- Improved the handling of the ``ownership_returns`` attribute. Now, Clang reports an
+ error if the attribute is attached to a function that returns a non-pointer value.
+ Fixes #GH99501.
+
Moved checkers
^^^^^^^^^^^^^^
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index 76a9aae..05d3f4d 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1703,7 +1703,13 @@ are detected:
* Invalid 3rd ("``whence``") argument to ``fseek``.
The stream operations are by this checker usually split into two cases, a success
-and a failure case. However, in the case of write operations (like ``fwrite``,
+and a failure case.
+In the success case it also assumes that the current value of ``stdout``,
+``stderr``, or ``stdin`` can't be equal to the file pointer returned by ``fopen``.
+Operations performed on ``stdout``, ``stderr``, or ``stdin`` are not checked by
+this checker, in contrast to the streams opened by ``fopen``.
+
+In the case of write operations (like ``fwrite``,
``fprintf`` and even ``fsetpos``) this behavior could produce a large amount of
unwanted reports on projects that don't have error checks around the write
operations, so by default the checker assumes that write operations always succeed.
@@ -1769,9 +1775,7 @@ are assumed to succeed.)
**Limitations**
The checker does not track the correspondence between integer file descriptors
-and ``FILE *`` pointers. Operations on standard streams like ``stdin`` are not
-treated specially and are therefore often not recognized (because these streams
-are usually not opened explicitly by the program, and are global variables).
+and ``FILE *`` pointers.
.. _osx-checkers:
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 6d1c8ca..ec8b325 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -1369,7 +1369,7 @@ public:
bool AsWritten = false);
/// Get a function type and produce the equivalent function type where
- /// pointer size address spaces in the return type and parameter tyeps are
+ /// pointer size address spaces in the return type and parameter types are
/// replaced with the default address space.
QualType getFunctionTypeWithoutPtrSizes(QualType T);
diff --git a/clang/include/clang/AST/ASTImporter.h b/clang/include/clang/AST/ASTImporter.h
index 4ffd913..f851dec 100644
--- a/clang/include/clang/AST/ASTImporter.h
+++ b/clang/include/clang/AST/ASTImporter.h
@@ -258,7 +258,6 @@ class TypeSourceInfo;
FoundDeclsTy findDeclsInToCtx(DeclContext *DC, DeclarationName Name);
void AddToLookupTable(Decl *ToD);
- llvm::Error ImportAttrs(Decl *ToD, Decl *FromD);
protected:
/// Can be overwritten by subclasses to implement their own import logic.
diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h
index 616f926..0546c19 100644
--- a/clang/include/clang/AST/ASTNodeTraverser.h
+++ b/clang/include/clang/AST/ASTNodeTraverser.h
@@ -583,7 +583,7 @@ public:
void VisitCapturedDecl(const CapturedDecl *D) { Visit(D->getBody()); }
void VisitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
- for (const auto *E : D->varlists())
+ for (const auto *E : D->varlist())
Visit(E);
}
@@ -603,7 +603,7 @@ public:
}
void VisitOMPAllocateDecl(const OMPAllocateDecl *D) {
- for (const auto *E : D->varlists())
+ for (const auto *E : D->varlist())
Visit(E);
for (const auto *C : D->clauselists())
Visit(C);
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index fb52ac8..3a11045 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -1188,10 +1188,6 @@ public:
///
/// \note This does NOT include a check for union-ness.
bool isEmpty() const { return data().Empty; }
- /// Marks this record as empty. This is used by DWARFASTParserClang
- /// when parsing records with empty fields having [[no_unique_address]]
- /// attribute
- void markEmpty() { data().Empty = true; }
void setInitMethod(bool Val) { data().HasInitMethod = Val; }
bool hasInitMethod() const { return data().HasInitMethod; }
diff --git a/clang/include/clang/AST/DeclOpenMP.h b/clang/include/clang/AST/DeclOpenMP.h
index e542c3c..8686622 100644
--- a/clang/include/clang/AST/DeclOpenMP.h
+++ b/clang/include/clang/AST/DeclOpenMP.h
@@ -143,10 +143,10 @@ public:
unsigned varlist_size() const { return Data->getNumChildren(); }
bool varlist_empty() const { return Data->getChildren().empty(); }
- varlist_range varlists() {
+ varlist_range varlist() {
return varlist_range(varlist_begin(), varlist_end());
}
- varlist_const_range varlists() const {
+ varlist_const_range varlist() const {
return varlist_const_range(varlist_begin(), varlist_end());
}
varlist_iterator varlist_begin() { return getVars().begin(); }
@@ -513,10 +513,10 @@ public:
unsigned clauselist_size() const { return Data->getNumClauses(); }
bool clauselist_empty() const { return Data->getClauses().empty(); }
- varlist_range varlists() {
+ varlist_range varlist() {
return varlist_range(varlist_begin(), varlist_end());
}
- varlist_const_range varlists() const {
+ varlist_const_range varlist() const {
return varlist_const_range(varlist_begin(), varlist_end());
}
varlist_iterator varlist_begin() { return getVars().begin(); }
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 325a1ba..b029c72 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -316,10 +316,10 @@ public:
unsigned varlist_size() const { return NumVars; }
bool varlist_empty() const { return NumVars == 0; }
- varlist_range varlists() {
+ varlist_range varlist() {
return varlist_range(varlist_begin(), varlist_end());
}
- varlist_const_range varlists() const {
+ varlist_const_range varlist() const {
return varlist_const_range(varlist_begin(), varlist_end());
}
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index e3c0cb4..dcf5dbf 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -1772,10 +1772,10 @@ DEF_TRAVERSE_DECL(UsingShadowDecl, {})
DEF_TRAVERSE_DECL(ConstructorUsingShadowDecl, {})
DEF_TRAVERSE_DECL(OMPThreadPrivateDecl, {
- for (auto *I : D->varlists()) {
+ for (auto *I : D->varlist()) {
TRY_TO(TraverseStmt(I));
}
- })
+})
DEF_TRAVERSE_DECL(OMPRequiresDecl, {
for (auto *C : D->clauselists()) {
@@ -1801,7 +1801,7 @@ DEF_TRAVERSE_DECL(OMPDeclareMapperDecl, {
DEF_TRAVERSE_DECL(OMPCapturedExprDecl, { TRY_TO(TraverseVarHelper(D)); })
DEF_TRAVERSE_DECL(OMPAllocateDecl, {
- for (auto *I : D->varlists())
+ for (auto *I : D->varlist())
TRY_TO(TraverseStmt(I));
for (auto *C : D->clauselists())
TRY_TO(TraverseOMPClause(C));
@@ -3552,7 +3552,7 @@ bool RecursiveASTVisitor<Derived>::VisitOMPNocontextClause(
template <typename Derived>
template <typename T>
bool RecursiveASTVisitor<Derived>::VisitOMPClauseList(T *Node) {
- for (auto *E : Node->varlists()) {
+ for (auto *E : Node->varlist()) {
TRY_TO(TraverseStmt(E));
}
return true;
@@ -3926,7 +3926,7 @@ template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOMPAffinityClause(
OMPAffinityClause *C) {
TRY_TO(TraverseStmt(C->getModifier()));
- for (Expr *E : C->varlists())
+ for (Expr *E : C->varlist())
TRY_TO(TraverseStmt(E));
return true;
}
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 72723c7..dec51e0 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2509,6 +2509,7 @@ public:
bool isFunctionNoProtoType() const { return getAs<FunctionNoProtoType>(); }
bool isFunctionProtoType() const { return getAs<FunctionProtoType>(); }
bool isPointerType() const;
+ bool isPointerOrReferenceType() const;
bool isSignableType() const;
bool isAnyPointerType() const; // Any C pointer or ObjC object pointer
bool isCountAttributedType() const;
@@ -4698,26 +4699,25 @@ public:
};
private:
- LLVM_PREFERRED_TYPE(Kind)
- unsigned FKind : 3;
+ Kind FKind;
// Expansion: for hypothetical TCB+types, there could be one Kind for TCB,
// then ~16(?) bits "SubKind" to map to a specific named TCB. SubKind would
// be considered for uniqueness.
public:
- FunctionEffect() : FKind(unsigned(Kind::None)) {}
+ FunctionEffect() : FKind(Kind::None) {}
- explicit FunctionEffect(Kind K) : FKind(unsigned(K)) {}
+ explicit FunctionEffect(Kind K) : FKind(K) {}
/// The kind of the effect.
- Kind kind() const { return Kind(FKind); }
+ Kind kind() const { return FKind; }
/// Return the opposite kind, for effects which have opposites.
Kind oppositeKind() const;
/// For serialization.
- uint32_t toOpaqueInt32() const { return FKind; }
+ uint32_t toOpaqueInt32() const { return uint32_t(FKind); }
static FunctionEffect fromOpaqueInt32(uint32_t Value) {
return FunctionEffect(Kind(Value));
}
@@ -7997,6 +7997,10 @@ inline bool Type::isPointerType() const {
return isa<PointerType>(CanonicalType);
}
+inline bool Type::isPointerOrReferenceType() const {
+ return isPointerType() || isReferenceType();
+}
+
inline bool Type::isAnyPointerType() const {
return isPointerType() || isObjCObjectPointerType();
}
diff --git a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
index 420f13c..5c64e5b 100644
--- a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
+++ b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
@@ -18,6 +18,7 @@
#include "clang/AST/Decl.h"
#include "clang/AST/Stmt.h"
#include "clang/Analysis/CFG.h"
+#include "clang/Analysis/FlowSensitive/ASTOps.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Error.h"
@@ -27,6 +28,20 @@
namespace clang {
namespace dataflow {
+namespace internal {
+class StmtToBlockMap {
+public:
+ StmtToBlockMap(const CFG &Cfg);
+
+ const CFGBlock *lookup(const Stmt &S) const {
+ return StmtToBlock.lookup(&ignoreCFGOmittedNodes(S));
+ }
+
+private:
+ llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
+};
+} // namespace internal
+
/// Holds CFG with additional information derived from it that is needed to
/// perform dataflow analysis.
class AdornedCFG {
@@ -48,9 +63,10 @@ public:
/// Returns the CFG that is stored in this context.
const CFG &getCFG() const { return *Cfg; }
- /// Returns a mapping from statements to basic blocks that contain them.
- const llvm::DenseMap<const Stmt *, const CFGBlock *> &getStmtToBlock() const {
- return StmtToBlock;
+ /// Returns the basic block that contains `S`, or null if no basic block
+ /// containing `S` is found.
+ const CFGBlock *blockForStmt(const Stmt &S) const {
+ return StmtToBlock.lookup(S);
}
/// Returns whether `B` is reachable from the entry block.
@@ -73,8 +89,7 @@ public:
private:
AdornedCFG(
const Decl &D, std::unique_ptr<CFG> Cfg,
- llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock,
- llvm::BitVector BlockReachable,
+ internal::StmtToBlockMap StmtToBlock, llvm::BitVector BlockReachable,
llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock)
: ContainingDecl(D), Cfg(std::move(Cfg)),
StmtToBlock(std::move(StmtToBlock)),
@@ -85,7 +100,7 @@ private:
/// The `Decl` containing the statement used to construct the CFG.
const Decl &ContainingDecl;
std::unique_ptr<CFG> Cfg;
- llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
+ internal::StmtToBlockMap StmtToBlock;
llvm::BitVector BlockReachable;
llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock;
};
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
index 50a7018..e6efde0 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
@@ -233,7 +233,7 @@ llvm::Expected<std::vector<
std::optional<DataflowAnalysisState<typename AnalysisT::Lattice>>>>
runDataflowAnalysis(const AdornedCFG &ACFG, AnalysisT &Analysis,
const Environment &InitEnv,
- CFGEltCallbacks<AnalysisT> PostAnalysisCallbacks,
+ CFGEltCallbacks<AnalysisT> PostAnalysisCallbacks = {},
std::int32_t MaxBlockVisits = kDefaultMaxBlockVisits) {
CFGEltCallbacksTypeErased TypeErasedCallbacks;
if (PostAnalysisCallbacks.Before) {
@@ -286,22 +286,6 @@ runDataflowAnalysis(const AdornedCFG &ACFG, AnalysisT &Analysis,
return std::move(BlockStates);
}
-/// Overload that takes only one post-analysis callback, which is run on the
-/// state after visiting the `CFGElement`. This is provided for backwards
-/// compatibility; new callers should call the overload taking `CFGEltCallbacks`
-/// instead.
-template <typename AnalysisT>
-llvm::Expected<std::vector<
- std::optional<DataflowAnalysisState<typename AnalysisT::Lattice>>>>
-runDataflowAnalysis(
- const AdornedCFG &ACFG, AnalysisT &Analysis, const Environment &InitEnv,
- CFGEltCallback<AnalysisT> PostAnalysisCallbackAfterElt = nullptr,
- std::int32_t MaxBlockVisits = kDefaultMaxBlockVisits) {
- return runDataflowAnalysis(ACFG, Analysis, InitEnv,
- {nullptr, PostAnalysisCallbackAfterElt},
- MaxBlockVisits);
-}
-
// Create an analysis class that is derived from `DataflowAnalysis`. This is an
// SFINAE adapter that allows us to call two different variants of constructor
// (either with or without the optional `Environment` parameter).
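With the single-callback overload removed and PostAnalysisCallbacks defaulting to {}, a caller that only wants the after-element hook can migrate roughly as in this sketch; MyAnalysis and the lambda body are placeholders, not part of this change:

    // Old: runDataflowAnalysis(ACFG, Analysis, InitEnv, AfterEltCallback);
    CFGEltCallbacks<MyAnalysis> Callbacks;
    Callbacks.After = [](const CFGElement &Elt,
                         const DataflowAnalysisState<MyAnalysis::Lattice> &State) {
      // ... inspect the state after each CFG element ...
    };
    auto BlockStates = runDataflowAnalysis(ACFG, Analysis, InitEnv, Callbacks);
    // Callers that never passed a callback can now omit the argument entirely.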
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 4825979..8ac2079 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -477,6 +477,9 @@ def TargetELF : TargetSpec {
def TargetELFOrMachO : TargetSpec {
let ObjectFormats = ["ELF", "MachO"];
}
+def TargetWindowsArm64EC : TargetSpec {
+ let CustomCode = [{ Target.getTriple().isWindowsArm64EC() }];
+}
def TargetSupportsInitPriority : TargetSpec {
let CustomCode = [{ !Target.getTriple().isOSzOS() }];
@@ -2047,6 +2050,17 @@ def Convergent : InheritableAttr {
let SimpleHandler = 1;
}
+def NoConvergent : InheritableAttr {
+ let Spellings = [Clang<"noconvergent">, Declspec<"noconvergent">];
+ let Subjects = SubjectList<[Function, Stmt], WarnDiag,
+ "functions and statements">;
+ let LangOpts = [CUDA];
+ let Documentation = [NoConvergentDocs];
+ let SimpleHandler = 1;
+}
+
+def : MutualExclusions<[Convergent, NoConvergent]>;
+
def NoInline : DeclOrStmtAttr {
let Spellings = [CustomKeyword<"__noinline__">, GCC<"noinline">,
CXX11<"clang", "noinline">, C23<"clang", "noinline">,
@@ -4027,6 +4041,12 @@ def SelectAny : InheritableAttr {
let SimpleHandler = 1;
}
+def HybridPatchable : InheritableAttr, TargetSpecificAttr<TargetWindowsArm64EC> {
+ let Spellings = [Declspec<"hybrid_patchable">, Clang<"hybrid_patchable">];
+ let Subjects = SubjectList<[Function]>;
+ let Documentation = [HybridPatchableDocs];
+}
+
def Thread : Attr {
let Spellings = [Declspec<"thread">];
let LangOpts = [MicrosoftExt];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 9973881..94c284f 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1382,6 +1382,34 @@ Sample usage:
}];
}
+def NoConvergentDocs : Documentation {
+ let Category = DocCatFunction;
+ let Content = [{
+This attribute prevents a function from being treated as convergent; a
+convergent function may have its calls moved only to control-equivalent
+blocks by optimizations. If a statement is marked as ``noconvergent`` and
+contains calls, those calls are likewise not treated as convergent, i.e.
+they are not restricted to being moved only to control-equivalent blocks.
+
+In languages following the SPMD/SIMT programming model, e.g. CUDA/HIP, function
+declarations and calls are treated as convergent by default for correctness.
+The ``noconvergent`` attribute lets developers opt functions and statements out
+of that treatment when doing so is known to be safe.
+
+.. code-block:: c
+
+  __device__ float bar(float);
+  __device__ float foo(float x) __attribute__((noconvergent)) { return x; }
+
+  __device__ void example(void) {
+    float x = 0.f;
+    [[clang::noconvergent]] x = bar(x);
+  }
+
+ }];
+}
+
def NoSplitStackDocs : Documentation {
let Category = DocCatFunction;
let Content = [{
@@ -2453,7 +2481,7 @@ For example:
typedef vint8m1_t fixed_vint8m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
#endif
-Creates a type ``fixed_vint8m1_t_t`` that is a fixed-length variant of
+Creates a type ``fixed_vint8m1_t`` that is a fixed-length variant of
``vint8m1_t`` that contains exactly 512 bits. Unlike ``vint8m1_t``, this type
can be used in globals, structs, unions, and arrays, all of which are
unsupported for sizeless types.
@@ -5985,6 +6013,16 @@ For more information see
or `msvc documentation <https://docs.microsoft.com/pl-pl/cpp/cpp/selectany>`_.
}]; }
+def HybridPatchableDocs : Documentation {
+ let Category = DocCatFunction;
+ let Content = [{
+The ``hybrid_patchable`` attribute declares an ARM64EC function with an additional
+x86-64 thunk, which may be patched at runtime.
+
+For more information see
+`ARM64EC ABI documentation <https://learn.microsoft.com/en-us/windows/arm/arm64ec-abi>`_.
+}]; }
+
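A minimal usage sketch for the new attribute (function names are illustrative); on targets other than Windows ARM64EC the attribute is rejected as target-specific:

    // Ask the compiler to emit the additional x86-64 thunk so the function
    // can be patched at runtime.
    __declspec(hybrid_patchable) int compute(int x);

    // Equivalent spelling with the clang attribute syntax.
    [[clang::hybrid_patchable]] int compute2(int x);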
def WebAssemblyExportNameDocs : Documentation {
let Category = DocCatFunction;
let Content = [{
diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h
index 5f024b4..cdf9dca 100644
--- a/clang/include/clang/Basic/AttributeCommonInfo.h
+++ b/clang/include/clang/Basic/AttributeCommonInfo.h
@@ -191,6 +191,12 @@ public:
/// __gnu__::__attr__ will be normalized to gnu::attr).
std::string getNormalizedFullName() const;
+ /// Generate a normalized full name, with syntax, scope and name.
+ static std::string
+ normalizeFullNameWithSyntax(const IdentifierInfo *Name,
+ const IdentifierInfo *Scope,
+ AttributeCommonInfo::Syntax SyntaxUsed);
+
bool isDeclspecAttribute() const { return SyntaxUsed == AS_Declspec; }
bool isMicrosoftAttribute() const { return SyntaxUsed == AS_Microsoft; }
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 12a4617..8a1462c 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -288,6 +288,9 @@ def err_function_needs_feature : Error<
let CategoryName = "Codegen ABI Check" in {
def err_function_always_inline_attribute_mismatch : Error<
"always_inline function %1 and its caller %0 have mismatching %2 attributes">;
+def warn_function_always_inline_attribute_mismatch : Warning<
+ "always_inline function %1 and its caller %0 have mismatching %2 attributes, "
+ "inlining may change runtime behaviour">, InGroup<AArch64SMEAttributes>;
def err_function_always_inline_new_za : Error<
"always_inline function %0 has new za state">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index beee243..581434d 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3330,6 +3330,8 @@ def err_attribute_invalid_implicit_this_argument : Error<
"%0 attribute is invalid for the implicit this argument">;
def err_ownership_type : Error<
"%0 attribute only applies to %select{pointer|integer}1 arguments">;
+def err_ownership_takes_return_type : Error<
+ "'ownership_returns' attribute only applies to functions that return a pointer">;
def err_ownership_returns_index_mismatch : Error<
"'ownership_returns' attribute index does not match; here it is %0">;
def note_ownership_returns_index_mismatch : Note<
@@ -3681,6 +3683,9 @@ def err_attribute_weak_static : Error<
"weak declaration cannot have internal linkage">;
def err_attribute_selectany_non_extern_data : Error<
"'selectany' can only be applied to data items with external linkage">;
+def warn_attribute_hybrid_patchable_non_extern : Warning<
+ "'hybrid_patchable' is ignored on functions without external linkage">,
+ InGroup<IgnoredAttributes>;
def err_declspec_thread_on_thread_variable : Error<
"'__declspec(thread)' applied to variable that already has a "
"thread-local storage specifier">;
@@ -10131,7 +10136,7 @@ def warn_dangling_lifetime_pointer : Warning<
InGroup<DanglingGsl>;
def warn_dangling_lifetime_pointer_assignment : Warning<"object backing the "
"pointer %0 will be destroyed at the end of the full-expression">,
- InGroup<DanglingAssignmentGsl>, DefaultIgnore;
+ InGroup<DanglingAssignmentGsl>;
def warn_new_dangling_initializer_list : Warning<
"array backing "
"%select{initializer list subobject of the allocated object|"
diff --git a/clang/include/clang/Basic/DiagnosticSerializationKinds.td b/clang/include/clang/Basic/DiagnosticSerializationKinds.td
index eb27de5..51d0abb 100644
--- a/clang/include/clang/Basic/DiagnosticSerializationKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSerializationKinds.td
@@ -50,14 +50,14 @@ def warn_pch_vfsoverlay_mismatch : Warning<
def note_pch_vfsoverlay_files : Note<"%select{PCH|current translation unit}0 has the following VFS overlays:\n%1">;
def note_pch_vfsoverlay_empty : Note<"%select{PCH|current translation unit}0 has no VFS overlays">;
-def err_pch_version_too_old : Error<
- "PCH file uses an older PCH format that is no longer supported">;
-def err_pch_version_too_new : Error<
- "PCH file uses a newer PCH format that cannot be read">;
-def err_pch_different_branch : Error<
- "PCH file built from a different branch (%0) than the compiler (%1)">;
-def err_pch_with_compiler_errors : Error<
- "PCH file contains compiler errors">;
+def err_ast_file_version_too_old : Error<
+ "%select{PCH|module|AST}0 file '%1' uses an older PCH format that is no longer supported">;
+def err_ast_file_version_too_new : Error<
+ "%select{PCH|module|AST}0 file '%1' uses a newer PCH format that cannot be read">;
+def err_ast_file_different_branch : Error<
+ "%select{PCH|module|AST}0 file '%1' built from a different branch (%2) than the compiler (%3)">;
+def err_ast_file_with_compiler_errors : Error<
+ "%select{PCH|module|AST}0 file '%1' contains compiler errors">;
def err_module_file_conflict : Error<
"module '%0' is defined in both '%1' and '%2'">, DefaultFatal;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index ccccc95..c8c56db 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -978,15 +978,15 @@ def Wsystem_headers_in_module_EQ : Joined<["-"], "Wsystem-headers-in-module=">,
HelpText<"Enable -Wsystem-headers when building <module>">,
MarshallingInfoStringVector<DiagnosticOpts<"SystemHeaderWarningsModules">>;
def Wdeprecated : Flag<["-"], "Wdeprecated">, Group<W_Group>,
- Visibility<[ClangOption, CC1Option]>,
+ Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Enable warnings for deprecated constructs and define __DEPRECATED">;
def Wno_deprecated : Flag<["-"], "Wno-deprecated">, Group<W_Group>,
Visibility<[ClangOption, CC1Option]>;
defm invalid_constexpr : BoolWOption<"invalid-constexpr",
LangOpts<"CheckConstexprFunctionBodies">,
Default<!strconcat("!", cpp23.KeyPath)>,
- NegFlag<SetFalse, [], [ClangOption, CC1Option], "Disable">,
- PosFlag<SetTrue, [], [ClangOption, CC1Option], "Enable">,
+ NegFlag<SetFalse, [HelpHidden], [ClangOption, CC1Option], "Disable">,
+ PosFlag<SetTrue, [HelpHidden], [ClangOption, CC1Option], "Enable">,
BothFlags<[], [ClangOption, CC1Option], " checking of constexpr function bodies for validity within a constant expression context">>;
def Wl_COMMA : CommaJoined<["-"], "Wl,">, Visibility<[ClangOption, FlangOption]>,
Flags<[LinkerInput, RenderAsInput]>,
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index fc7d005..623f868 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -1160,6 +1160,11 @@ private:
/// invoked (at which point the last position is popped).
std::vector<CachedTokensTy::size_type> BacktrackPositions;
+ /// Stack of pairs of cached-token buffers and the initial number of cached
+ /// tokens, allowing nested unannotated backtracks.
+ std::vector<std::pair<CachedTokensTy, CachedTokensTy::size_type>>
+ UnannotatedBacktrackTokens;
+
/// True if \p Preprocessor::SkipExcludedConditionalBlock() is running.
/// This is used to guard against calling this function recursively.
///
@@ -1722,8 +1727,16 @@ public:
/// at some point after EnableBacktrackAtThisPos. If you don't, caching of
/// tokens will continue indefinitely.
///
- void EnableBacktrackAtThisPos();
+ /// \param Unannotated Whether token annotations are reverted upon calling
+ /// Backtrack().
+ void EnableBacktrackAtThisPos(bool Unannotated = false);
+
+private:
+ std::pair<CachedTokensTy::size_type, bool> LastBacktrackPos();
+
+ CachedTokensTy PopUnannotatedBacktrackTokens();
+public:
/// Disable the last EnableBacktrackAtThisPos call.
void CommitBacktrackedTokens();
@@ -1735,6 +1748,12 @@ public:
/// caching of tokens is on.
bool isBacktrackEnabled() const { return !BacktrackPositions.empty(); }
+ /// True if EnableBacktrackAtThisPos() was called and
+ /// caching of unannotated tokens is on.
+ bool isUnannotatedBacktrackEnabled() const {
+ return !UnannotatedBacktrackTokens.empty();
+ }
+
/// Lex the next token for this preprocessor.
void Lex(Token &Result);
@@ -1841,8 +1860,9 @@ public:
void RevertCachedTokens(unsigned N) {
assert(isBacktrackEnabled() &&
"Should only be called when tokens are cached for backtracking");
- assert(signed(CachedLexPos) - signed(N) >= signed(BacktrackPositions.back())
- && "Should revert tokens up to the last backtrack position, not more");
+ assert(signed(CachedLexPos) - signed(N) >=
+ signed(LastBacktrackPos().first) &&
+ "Should revert tokens up to the last backtrack position, not more");
assert(signed(CachedLexPos) - signed(N) >= 0 &&
"Corrupted backtrack positions ?");
CachedLexPos -= N;
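A hedged sketch of how the new flag is meant to be used from a caller's point of view (the predicate is hypothetical); unannotated backtracks keep their own stack, so they nest independently of plain backtracks:

    PP.EnableBacktrackAtThisPos(/*Unannotated=*/true);
    Token Tok;
    PP.Lex(Tok);
    if (!looksLikeWhatWeWanted(Tok))  // hypothetical predicate
      PP.Backtrack();                 // replays the cached tokens and reverts
                                      // any annotations made since the mark
    else
      PP.CommitBacktrackedTokens();   // keep the lexed/annotated tokens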
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 35bb1a1..ba7d686 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1025,6 +1025,8 @@ private:
/// ....
/// TPA.Revert();
///
+ /// If the Unannotated parameter is true, any token annotations created
+ /// during the tentative parse are reverted.
class TentativeParsingAction {
Parser &P;
PreferredTypeBuilder PrevPreferredType;
@@ -1034,7 +1036,7 @@ private:
bool isActive;
public:
- explicit TentativeParsingAction(Parser &p)
+ explicit TentativeParsingAction(Parser &p, bool Unannotated = false)
: P(p), PrevPreferredType(P.PreferredType) {
PrevTok = P.Tok;
PrevTentativelyDeclaredIdentifierCount =
@@ -1042,7 +1044,7 @@ private:
PrevParenCount = P.ParenCount;
PrevBracketCount = P.BracketCount;
PrevBraceCount = P.BraceCount;
- P.PP.EnableBacktrackAtThisPos();
+ P.PP.EnableBacktrackAtThisPos(Unannotated);
isActive = true;
}
void Commit() {
@@ -1073,13 +1075,11 @@ private:
class RevertingTentativeParsingAction
: private Parser::TentativeParsingAction {
public:
- RevertingTentativeParsingAction(Parser &P)
- : Parser::TentativeParsingAction(P) {}
+ using TentativeParsingAction::TentativeParsingAction;
+
~RevertingTentativeParsingAction() { Revert(); }
};
- class UnannotatedTentativeParsingAction;
-
/// ObjCDeclContextSwitch - An object used to switch context from
/// an objective-c decl context to its enclosing decl context and
/// back.
@@ -1984,7 +1984,8 @@ private:
CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHasErrors,
bool EnteringContext, bool *MayBePseudoDestructor = nullptr,
bool IsTypename = false, const IdentifierInfo **LastII = nullptr,
- bool OnlyNamespace = false, bool InUsingDeclaration = false);
+ bool OnlyNamespace = false, bool InUsingDeclaration = false,
+ bool Disambiguation = false);
//===--------------------------------------------------------------------===//
// C++11 5.1.2: Lambda expressions
diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 26ffe05..d6a6cee 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -984,7 +984,7 @@ class Sema;
unsigned getNumParams() const {
if (IsSurrogate) {
QualType STy = Surrogate->getConversionType();
- while (STy->isPointerType() || STy->isReferenceType())
+ while (STy->isPointerOrReferenceType())
STy = STy->getPointeeType();
return STy->castAs<FunctionProtoType>()->getNumParams();
}
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index a465cdf..b0cc7cb 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3283,7 +3283,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
return;
case Type::Builtin: {
- const auto *BTy = T->getAs<BuiltinType>();
+ const auto *BTy = T->castAs<BuiltinType>();
switch (BTy->getKind()) {
#define SIGNED_TYPE(Id, SingletonId) \
case BuiltinType::Id: \
@@ -3366,7 +3366,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx,
llvm_unreachable("should never get here");
}
case Type::Record: {
- const RecordDecl *RD = T->getAs<RecordType>()->getDecl();
+ const RecordDecl *RD = T->castAs<RecordType>()->getDecl();
const IdentifierInfo *II = RD->getIdentifier();
// In C++, an immediate typedef of an anonymous struct or union
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 08ef09d..1032355 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -359,6 +359,54 @@ namespace clang {
Params, Importer.getToContext().getTranslationUnitDecl());
}
+ template <typename TemplateParmDeclT>
+ void tryUpdateTemplateParmDeclInheritedFrom(NamedDecl *RecentParm,
+ NamedDecl *NewParm) {
+ if (auto *ParmT = dyn_cast<TemplateParmDeclT>(RecentParm)) {
+ if (ParmT->hasDefaultArgument()) {
+ auto *P = cast<TemplateParmDeclT>(NewParm);
+ P->removeDefaultArgument();
+ P->setInheritedDefaultArgument(Importer.ToContext, ParmT);
+ }
+ }
+ }
+
+ // Update the parameter list `NewParams` of a template declaration
+ // by "inheriting" default argument values from `RecentParams`,
+ // which is the parameter list of an earlier declaration of the
+ // same template. (Note that "inheriting" default argument values
+ // is not related to object-oriented inheritance.)
+ //
+ // In the clang AST, template parameters (NonTypeTemplateParmDecl,
+ // TemplateTypeParmDecl, TemplateTemplateParmDecl) have a reference to the
+ // default value, if one is specified at the first declaration. The default
+ // value can be specified only once. The template parameters of the
+ // following declarations have a reference to the original default value
+ // through the "inherited" value. This value should be set for all imported
+ // template parameters that have a previous declaration (also a previous
+ // template declaration).
+ //
+ // In the `Visit*ParmDecl` functions the default value of these template
+ // arguments is always imported. At that location the previous declaration
+ // is not easily accessible, so it is not possible to call
+ // `setInheritedDefaultArgument` at that place.
+ // `updateTemplateParametersInheritedFrom` is called later when the already
+ // imported default value is erased and changed to "inherited".
+ // It is important to change the mode to "inherited"; otherwise, false
+ // structural inequivalences could be detected.
+ void updateTemplateParametersInheritedFrom(
+ const TemplateParameterList &RecentParams,
+ TemplateParameterList &NewParams) {
+ for (auto [Idx, Param] : enumerate(RecentParams)) {
+ tryUpdateTemplateParmDeclInheritedFrom<NonTypeTemplateParmDecl>(
+ Param, NewParams.getParam(Idx));
+ tryUpdateTemplateParmDeclInheritedFrom<TemplateTypeParmDecl>(
+ Param, NewParams.getParam(Idx));
+ tryUpdateTemplateParmDeclInheritedFrom<TemplateTemplateParmDecl>(
+ Param, NewParams.getParam(Idx));
+ }
+ }
+
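For intuition, the "inherited" default arguments described above correspond to source like the following illustrative snippet; the redeclaration's parameter does not own a default but refers back to the one on the first declaration:

    template <typename T = int> struct S;  // default argument written here
    template <typename T> struct S {};     // redeclaration: the default for T
                                           // is "inherited" from the first decl
    S<> s;                                 // still usable as S<int>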
public:
explicit ASTNodeImporter(ASTImporter &Importer) : Importer(Importer) {}
@@ -4179,12 +4227,6 @@ ExpectedDecl ASTNodeImporter::VisitFieldDecl(FieldDecl *D) {
D->getInClassInitStyle()))
return ToField;
- // We need [[no_unqiue_address]] attributes to be added to FieldDecl, before
- // we add fields in CXXRecordDecl::addedMember, otherwise record will be
- // marked as having non-zero size.
- Err = Importer.ImportAttrs(ToField, D);
- if (Err)
- return std::move(Err);
ToField->setAccess(D->getAccess());
ToField->setLexicalDeclContext(LexicalDC);
ToField->setImplicit(D->isImplicit());
@@ -6138,6 +6180,9 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateDecl(ClassTemplateDecl *D) {
}
D2->setPreviousDecl(Recent);
+
+ updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()),
+ **TemplateParamsOrErr);
}
return D2;
@@ -6452,6 +6497,9 @@ ExpectedDecl ASTNodeImporter::VisitVarTemplateDecl(VarTemplateDecl *D) {
ToTemplated->setPreviousDecl(PrevTemplated);
}
ToVarTD->setPreviousDecl(Recent);
+
+ updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()),
+ **TemplateParamsOrErr);
}
return ToVarTD;
@@ -6724,6 +6772,9 @@ ASTNodeImporter::VisitFunctionTemplateDecl(FunctionTemplateDecl *D) {
TemplatedFD->setPreviousDecl(PrevTemplated);
}
ToFunc->setPreviousDecl(Recent);
+
+ updateTemplateParametersInheritedFrom(*(Recent->getTemplateParameters()),
+ *Params);
}
return ToFunc;
@@ -9399,19 +9450,6 @@ TranslationUnitDecl *ASTImporter::GetFromTU(Decl *ToD) {
return FromDPos->second->getTranslationUnitDecl();
}
-Error ASTImporter::ImportAttrs(Decl *ToD, Decl *FromD) {
- if (!FromD->hasAttrs() || ToD->hasAttrs())
- return Error::success();
- for (const Attr *FromAttr : FromD->getAttrs()) {
- auto ToAttrOrErr = Import(FromAttr);
- if (ToAttrOrErr)
- ToD->addAttr(*ToAttrOrErr);
- else
- return ToAttrOrErr.takeError();
- }
- return Error::success();
-}
-
Expected<Decl *> ASTImporter::Import(Decl *FromD) {
if (!FromD)
return nullptr;
@@ -9545,8 +9583,15 @@ Expected<Decl *> ASTImporter::Import(Decl *FromD) {
}
// Make sure that ImportImpl registered the imported decl.
assert(ImportedDecls.count(FromD) != 0 && "Missing call to MapImported?");
- if (auto Error = ImportAttrs(ToD, FromD))
- return std::move(Error);
+
+ if (FromD->hasAttrs())
+ for (const Attr *FromAttr : FromD->getAttrs()) {
+ auto ToAttrOrErr = Import(FromAttr);
+ if (ToAttrOrErr)
+ ToD->addAttr(*ToAttrOrErr);
+ else
+ return ToAttrOrErr.takeError();
+ }
// Notify subclasses.
Imported(FromD, ToD);
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index e2c9643..6212989 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -152,7 +152,7 @@ bool CXXTypeidExpr::isMostDerived(ASTContext &Context) const {
const Expr *E = getExprOperand()->IgnoreParenNoopCasts(Context);
if (const auto *DRE = dyn_cast<DeclRefExpr>(E)) {
QualType Ty = DRE->getDecl()->getType();
- if (!Ty->isPointerType() && !Ty->isReferenceType())
+ if (!Ty->isPointerOrReferenceType())
return true;
}
diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index c07c106..258e4ed 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -4373,6 +4373,7 @@ bool Compiler<Emitter>::visitWhileStmt(const WhileStmt *S) {
if (!this->jump(CondLabel))
return false;
+ this->fallthrough(EndLabel);
this->emitLabel(EndLabel);
return true;
diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp
index b5e992c..b1e06cd 100644
--- a/clang/lib/AST/Interp/Context.cpp
+++ b/clang/lib/AST/Interp/Context.cpp
@@ -176,8 +176,7 @@ std::optional<PrimType> Context::classify(QualType T) const {
T->isFunctionType())
return PT_FnPtr;
- if (T->isReferenceType() || T->isPointerType() ||
- T->isObjCObjectPointerType())
+ if (T->isPointerOrReferenceType() || T->isObjCObjectPointerType())
return PT_Ptr;
if (const auto *AT = T->getAs<AtomicType>())
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index 4f7e9ea..671f2c03 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -33,7 +33,8 @@ static void dtorTy(Block *, std::byte *Ptr, const Descriptor *) {
template <typename T>
static void moveTy(Block *, const std::byte *Src, std::byte *Dst,
const Descriptor *) {
- const auto *SrcPtr = reinterpret_cast<const T *>(Src);
+ // FIXME: Get rid of the const_cast.
+ auto *SrcPtr = reinterpret_cast<T *>(const_cast<std::byte *>(Src));
auto *DstPtr = reinterpret_cast<T *>(Dst);
new (DstPtr) T(std::move(*SrcPtr));
}
diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp
index 1b25571..bdebd19 100644
--- a/clang/lib/AST/Interp/EvaluationResult.cpp
+++ b/clang/lib/AST/Interp/EvaluationResult.cpp
@@ -10,7 +10,9 @@
#include "InterpState.h"
#include "Record.h"
#include "clang/AST/ExprCXX.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include <iterator>
namespace clang {
namespace interp {
@@ -122,19 +124,18 @@ static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc,
}
// Check Fields in all bases
- for (const Record::Base &B : R->bases()) {
+ for (auto [I, B] : llvm::enumerate(R->bases())) {
Pointer P = BasePtr.atField(B.Offset);
if (!P.isInitialized()) {
const Descriptor *Desc = BasePtr.getDeclDesc();
- if (Desc->asDecl())
- S.FFDiag(BasePtr.getDeclDesc()->asDecl()->getLocation(),
- diag::note_constexpr_uninitialized_base)
+ if (const auto *CD = dyn_cast_if_present<CXXRecordDecl>(R->getDecl())) {
+ const auto &BS = *std::next(CD->bases_begin(), I);
+ S.FFDiag(BS.getBaseTypeLoc(), diag::note_constexpr_uninitialized_base)
+ << B.Desc->getType() << BS.getSourceRange();
+ } else {
+ S.FFDiag(Desc->getLocation(), diag::note_constexpr_uninitialized_base)
<< B.Desc->getType();
- else
- S.FFDiag(BasePtr.getDeclDesc()->asExpr()->getExprLoc(),
- diag::note_constexpr_uninitialized_base)
- << B.Desc->getType();
-
+ }
return false;
}
Result &= CheckFieldsInitialized(S, Loc, P, B.R);
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 22a86ab..63e9966 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -2588,12 +2588,20 @@ inline bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func,
size_t ThisOffset = ArgSize - (Func->hasRVO() ? primSize(PT_Ptr) : 0);
Pointer &ThisPtr = S.Stk.peek<Pointer>(ThisOffset);
- QualType DynamicType = ThisPtr.getDeclDesc()->getType();
- const CXXRecordDecl *DynamicDecl;
- if (DynamicType->isPointerType() || DynamicType->isReferenceType())
- DynamicDecl = DynamicType->getPointeeCXXRecordDecl();
- else
- DynamicDecl = ThisPtr.getDeclDesc()->getType()->getAsCXXRecordDecl();
+ const CXXRecordDecl *DynamicDecl = nullptr;
+ {
+ Pointer TypePtr = ThisPtr;
+ while (TypePtr.isBaseClass())
+ TypePtr = TypePtr.getBase();
+
+ QualType DynamicType = TypePtr.getType();
+ if (DynamicType->isPointerType() || DynamicType->isReferenceType())
+ DynamicDecl = DynamicType->getPointeeCXXRecordDecl();
+ else
+ DynamicDecl = DynamicType->getAsCXXRecordDecl();
+ }
+ assert(DynamicDecl);
+
const auto *StaticDecl = cast<CXXRecordDecl>(Func->getParentDecl());
const auto *InitialFunction = cast<CXXMethodDecl>(Func->getDecl());
const CXXMethodDecl *Overrider = S.getContext().getOverridingFunction(
@@ -2620,7 +2628,29 @@ inline bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func,
}
}
- return Call(S, OpPC, Func, VarArgSize);
+ if (!Call(S, OpPC, Func, VarArgSize))
+ return false;
+
+ // Covariant return types. The return type of Overrider is a pointer
+ // or reference to a class type.
+ if (Overrider != InitialFunction &&
+ Overrider->getReturnType()->isPointerOrReferenceType() &&
+ InitialFunction->getReturnType()->isPointerOrReferenceType()) {
+ QualType OverriderPointeeType =
+ Overrider->getReturnType()->getPointeeType();
+ QualType InitialPointeeType =
+ InitialFunction->getReturnType()->getPointeeType();
+ // We've called Overrider above, but calling code expects us to return what
+ // InitialFunction returned. According to the rules for covariant return
+ // types, what InitialFunction returns needs to be a base class of what
+ // Overrider returns. So, we need to do an upcast here.
+ unsigned Offset = S.getContext().collectBaseOffset(
+ InitialPointeeType->getAsRecordDecl(),
+ OverriderPointeeType->getAsRecordDecl());
+ return GetPtrBasePop(S, OpPC, Offset);
+ }
+
+ return true;
}
inline bool CallBI(InterpState &S, CodePtr &PC, const Function *Func,
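The covariant-return branch added above corresponds to C++ like the sketch below (C++20, illustrative only): the overrider returns Derived *, but a caller dispatching through Base must receive a Base *, which is why the interpreter upcasts the returned pointer.

    struct Base {
      constexpr virtual const Base *self() const { return this; }
    };
    struct Derived : Base {
      // Covariant return type: the overrider returns Derived *, but callers
      // going through Base still see Base *.
      constexpr const Derived *self() const override { return this; }
    };
    constexpr bool covariantCallWorks() {
      Derived D;
      const Base &B = D;
      return B.self() == &D;  // virtual dispatch during constant evaluation
    }
    static_assert(covariantCallWorks());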
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index d46d621..ead5da4 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -6484,7 +6484,7 @@ void CXXNameMangler::mangleValueInTemplateArg(QualType T, const APValue &V,
case APValue::LValue: {
// Proposed in https://github.com/itanium-cxx-abi/cxx-abi/issues/47.
- assert((T->isPointerType() || T->isReferenceType()) &&
+ assert((T->isPointerOrReferenceType()) &&
"unexpected type for LValue template arg");
if (V.isNullPointer()) {
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index e0d7c01..28f66e71 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -1015,6 +1015,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) {
case APFloat::S_Float8E5M2FNUZ:
case APFloat::S_Float8E4M3FNUZ:
case APFloat::S_Float8E4M3B11FNUZ:
+ case APFloat::S_Float8E3M4:
case APFloat::S_FloatTF32:
case APFloat::S_Float6E3M2FN:
case APFloat::S_Float6E2M3FN:
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 89d2a42..f1e723b 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -624,7 +624,7 @@ void OMPClauseProfiler::VisitOMPFilterClause(const OMPFilterClause *C) {
template<typename T>
void OMPClauseProfiler::VisitOMPClauseList(T *Node) {
- for (auto *E : Node->varlists()) {
+ for (auto *E : Node->varlist()) {
if (E)
Profiler->VisitStmt(E);
}
@@ -918,7 +918,7 @@ void OMPClauseProfiler::VisitOMPUsesAllocatorsClause(
void OMPClauseProfiler::VisitOMPAffinityClause(const OMPAffinityClause *C) {
if (const Expr *Modifier = C->getModifier())
Profiler->VisitStmt(Modifier);
- for (const Expr *E : C->varlists())
+ for (const Expr *E : C->varlist())
Profiler->VisitStmt(E);
}
void OMPClauseProfiler::VisitOMPOrderClause(const OMPOrderClause *C) {}
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index fdaab8e..0456b5f 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -75,7 +75,7 @@ bool Qualifiers::isStrictSupersetOf(Qualifiers Other) const {
const IdentifierInfo* QualType::getBaseTypeIdentifier() const {
const Type* ty = getTypePtr();
NamedDecl *ND = nullptr;
- if (ty->isPointerType() || ty->isReferenceType())
+ if (ty->isPointerOrReferenceType())
return ty->getPointeeType().getBaseTypeIdentifier();
else if (ty->isRecordType())
ND = ty->castAs<RecordType>()->getDecl();
diff --git a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
index bf87b1aa..06309d3 100644
--- a/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
+++ b/clang/lib/ASTMatchers/ASTMatchersInternal.cpp
@@ -537,14 +537,23 @@ public:
/// that didn't match.
/// Return true if there are still any patterns left.
bool consumeNameSuffix(StringRef NodeName, bool CanSkip) {
- for (size_t I = 0; I < Patterns.size();) {
- if (::clang::ast_matchers::internal::consumeNameSuffix(Patterns[I].P,
- NodeName) ||
- CanSkip) {
- ++I;
- } else {
- Patterns.erase(Patterns.begin() + I);
+ if (CanSkip) {
+ // If we can skip the node, then we need to handle the case where a
+ // skipped node has the same name as its parent.
+ // namespace a { inline namespace a { class A; } }
+ // cxxRecordDecl(hasName("::a::A"))
+ // To do this, any patterns that match should be duplicated in our set,
+ // one of them with the tail removed.
+ for (size_t I = 0, E = Patterns.size(); I != E; ++I) {
+ StringRef Pattern = Patterns[I].P;
+ if (ast_matchers::internal::consumeNameSuffix(Patterns[I].P, NodeName))
+ Patterns.push_back({Pattern, Patterns[I].IsFullyQualified});
}
+ } else {
+ llvm::erase_if(Patterns, [&NodeName](auto &Pattern) {
+ return !::clang::ast_matchers::internal::consumeNameSuffix(Pattern.P,
+ NodeName);
+ });
}
return !Patterns.empty();
}
diff --git a/clang/lib/Analysis/Consumed.cpp b/clang/lib/Analysis/Consumed.cpp
index d01c7f6..63c5943 100644
--- a/clang/lib/Analysis/Consumed.cpp
+++ b/clang/lib/Analysis/Consumed.cpp
@@ -141,7 +141,7 @@ static bool isCallableInState(const CallableWhenAttr *CWAttr,
}
static bool isConsumableType(const QualType &QT) {
- if (QT->isPointerType() || QT->isReferenceType())
+ if (QT->isPointerOrReferenceType())
return false;
if (const CXXRecordDecl *RD = QT->getAsCXXRecordDecl())
@@ -151,7 +151,7 @@ static bool isConsumableType(const QualType &QT) {
}
static bool isAutoCastType(const QualType &QT) {
- if (QT->isPointerType() || QT->isReferenceType())
+ if (QT->isPointerOrReferenceType())
return false;
if (const CXXRecordDecl *RD = QT->getAsCXXRecordDecl())
@@ -186,10 +186,6 @@ static bool isTestingFunction(const FunctionDecl *FunDecl) {
return FunDecl->hasAttr<TestTypestateAttr>();
}
-static bool isPointerOrRef(QualType ParamType) {
- return ParamType->isPointerType() || ParamType->isReferenceType();
-}
-
static ConsumedState mapConsumableAttrState(const QualType QT) {
assert(isConsumableType(QT));
@@ -648,7 +644,7 @@ bool ConsumedStmtVisitor::handleCall(const CallExpr *Call, const Expr *ObjArg,
setStateForVarOrTmp(StateMap, PInfo, mapReturnTypestateAttrState(RT));
else if (isRValueRef(ParamType) || isConsumableType(ParamType))
setStateForVarOrTmp(StateMap, PInfo, consumed::CS_Consumed);
- else if (isPointerOrRef(ParamType) &&
+ else if (ParamType->isPointerOrReferenceType() &&
(!ParamType->getPointeeType().isConstQualified() ||
isSetOnReadPtrType(ParamType)))
setStateForVarOrTmp(StateMap, PInfo, consumed::CS_Unknown);
diff --git a/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
index 2555430..876b5a3 100644
--- a/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
+++ b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
@@ -16,6 +16,7 @@
#include "clang/AST/Decl.h"
#include "clang/AST/Stmt.h"
#include "clang/Analysis/CFG.h"
+#include "clang/Analysis/FlowSensitive/ASTOps.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Error.h"
@@ -96,8 +97,7 @@ static llvm::BitVector findReachableBlocks(const CFG &Cfg) {
static llvm::DenseSet<const CFGBlock *>
buildContainsExprConsumedInDifferentBlock(
- const CFG &Cfg,
- const llvm::DenseMap<const Stmt *, const CFGBlock *> &StmtToBlock) {
+ const CFG &Cfg, const internal::StmtToBlockMap &StmtToBlock) {
llvm::DenseSet<const CFGBlock *> Result;
auto CheckChildExprs = [&Result, &StmtToBlock](const Stmt *S,
@@ -105,7 +105,7 @@ buildContainsExprConsumedInDifferentBlock(
for (const Stmt *Child : S->children()) {
if (!isa_and_nonnull<Expr>(Child))
continue;
- const CFGBlock *ChildBlock = StmtToBlock.lookup(Child);
+ const CFGBlock *ChildBlock = StmtToBlock.lookup(*Child);
if (ChildBlock != Block)
Result.insert(ChildBlock);
}
@@ -126,6 +126,13 @@ buildContainsExprConsumedInDifferentBlock(
return Result;
}
+namespace internal {
+
+StmtToBlockMap::StmtToBlockMap(const CFG &Cfg)
+ : StmtToBlock(buildStmtToBasicBlockMap(Cfg)) {}
+
+} // namespace internal
+
llvm::Expected<AdornedCFG> AdornedCFG::build(const FunctionDecl &Func) {
if (!Func.doesThisDeclarationHaveABody())
return llvm::createStringError(
@@ -166,8 +173,7 @@ llvm::Expected<AdornedCFG> AdornedCFG::build(const Decl &D, Stmt &S,
std::make_error_code(std::errc::invalid_argument),
"CFG::buildCFG failed");
- llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock =
- buildStmtToBasicBlockMap(*Cfg);
+ internal::StmtToBlockMap StmtToBlock(*Cfg);
llvm::BitVector BlockReachable = findReachableBlocks(*Cfg);
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 3c896d3..9c54eb1 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -40,17 +40,16 @@ namespace clang {
namespace dataflow {
const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const {
- auto BlockIt = ACFG.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
- if (BlockIt == ACFG.getStmtToBlock().end()) {
+ const CFGBlock *Block = ACFG.blockForStmt(S);
+ if (Block == nullptr) {
assert(false);
- // Return null to avoid dereferencing the end iterator in non-assert builds.
return nullptr;
}
- if (!ACFG.isBlockReachable(*BlockIt->getSecond()))
+ if (!ACFG.isBlockReachable(*Block))
return nullptr;
- if (BlockIt->getSecond()->getBlockID() == CurBlockID)
+ if (Block->getBlockID() == CurBlockID)
return &CurState.Env;
- const auto &State = BlockToState[BlockIt->getSecond()->getBlockID()];
+ const auto &State = BlockToState[Block->getBlockID()];
if (!(State))
return nullptr;
return &State->Env;
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index 200682f..8afd18b 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -243,10 +243,11 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
// See `NoreturnDestructorTest` for concrete examples.
if (Block.succ_begin()->getReachableBlock() != nullptr &&
Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) {
- auto &StmtToBlock = AC.ACFG.getStmtToBlock();
- auto StmtBlock = StmtToBlock.find(Block.getTerminatorStmt());
- assert(StmtBlock != StmtToBlock.end());
- llvm::erase(Preds, StmtBlock->getSecond());
+ const CFGBlock *StmtBlock = nullptr;
+ if (const Stmt *Terminator = Block.getTerminatorStmt())
+ StmtBlock = AC.ACFG.blockForStmt(*Terminator);
+ assert(StmtBlock != nullptr);
+ llvm::erase(Preds, StmtBlock);
}
}
diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp
index 6d03dd0..481932e 100644
--- a/clang/lib/Analysis/LiveVariables.cpp
+++ b/clang/lib/Analysis/LiveVariables.cpp
@@ -214,6 +214,22 @@ static void AddLiveExpr(llvm::ImmutableSet<const Expr *> &Set,
Set = F.add(Set, LookThroughExpr(E));
}
+/// Add as a live expression all individual conditions in a logical expression.
+/// For example, for the expression:
+/// "(a < b) || (c && d && ((e || f) != (g && h)))"
+/// the following expressions will be added as live:
+/// "a < b", "c", "d", "((e || f) != (g && h))"
+static void AddAllConditionalTerms(llvm::ImmutableSet<const Expr *> &Set,
+ llvm::ImmutableSet<const Expr *>::Factory &F,
+ const Expr *Cond) {
+ AddLiveExpr(Set, F, Cond);
+ if (auto const *BO = dyn_cast<BinaryOperator>(Cond->IgnoreParens());
+ BO && BO->isLogicalOp()) {
+ AddAllConditionalTerms(Set, F, BO->getLHS());
+ AddAllConditionalTerms(Set, F, BO->getRHS());
+ }
+}
+
void TransferFunctions::Visit(Stmt *S) {
if (observer)
observer->observeStmt(S, currentBlock, val);
@@ -313,7 +329,27 @@ void TransferFunctions::Visit(Stmt *S) {
AddLiveExpr(val.liveExprs, LV.ESetFact, cast<ForStmt>(S)->getCond());
return;
}
-
+ case Stmt::ConditionalOperatorClass: {
+ // Keep not only direct children alive, but also all the short-circuited
+ // parts of the condition. Short-circuiting evaluation may cause the
+ // conditional operator evaluation to skip the evaluation of the entire
+ // condition expression, so the value of the entire condition expression is
+ // never computed.
+ //
+ // This makes a difference when we compare exploded nodes coming from true
+ // and false expressions with no side effects: the only difference in the
+ // state is the value of (part of) the condition.
+ //
+ // BinaryConditionalOperatorClass ('x ?: y') is not affected because it
+ // explicitly calculates the value of the entire condition expression (to
+ // possibly use as a value for the "true expr") even if it is
+ // short-circuited.
+ auto const *CO = cast<ConditionalOperator>(S);
+ AddAllConditionalTerms(val.liveExprs, LV.ESetFact, CO->getCond());
+ AddLiveExpr(val.liveExprs, LV.ESetFact, CO->getTrueExpr());
+ AddLiveExpr(val.liveExprs, LV.ESetFact, CO->getFalseExpr());
+ return;
+ }
}
// HACK + FIXME: What is this? One could only guess that this is an attempt to
diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp
index 3e8c959..cbcfefd 100644
--- a/clang/lib/Analysis/ThreadSafetyCommon.cpp
+++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp
@@ -97,7 +97,7 @@ static StringRef ClassifyDiagnostic(QualType VDT) {
if (const auto *TD = TT->getDecl())
if (const auto *CA = TD->getAttr<CapabilityAttr>())
return ClassifyDiagnostic(CA);
- } else if (VDT->isPointerType() || VDT->isReferenceType())
+ } else if (VDT->isPointerOrReferenceType())
return ClassifyDiagnostic(VDT->getPointeeType());
return "mutex";
diff --git a/clang/lib/Basic/Attributes.cpp b/clang/lib/Basic/Attributes.cpp
index 867d241..a39eb85 100644
--- a/clang/lib/Basic/Attributes.cpp
+++ b/clang/lib/Basic/Attributes.cpp
@@ -153,6 +153,40 @@ std::string AttributeCommonInfo::getNormalizedFullName() const {
normalizeName(getAttrName(), getScopeName(), getSyntax()));
}
+static StringRef getSyntaxName(AttributeCommonInfo::Syntax SyntaxUsed) {
+ switch (SyntaxUsed) {
+ case AttributeCommonInfo::AS_GNU:
+ return "GNU";
+ case AttributeCommonInfo::AS_CXX11:
+ return "CXX11";
+ case AttributeCommonInfo::AS_C23:
+ return "C23";
+ case AttributeCommonInfo::AS_Declspec:
+ return "Declspec";
+ case AttributeCommonInfo::AS_Microsoft:
+ return "Microsoft";
+ case AttributeCommonInfo::AS_Keyword:
+ return "Keyword";
+ case AttributeCommonInfo::AS_Pragma:
+ return "Pragma";
+ case AttributeCommonInfo::AS_ContextSensitiveKeyword:
+ return "ContextSensitiveKeyword";
+ case AttributeCommonInfo::AS_HLSLAnnotation:
+ return "HLSLAnnotation";
+ case AttributeCommonInfo::AS_Implicit:
+ return "Implicit";
+ }
+ llvm_unreachable("Invalid attribute syntax");
+}
+
+std::string AttributeCommonInfo::normalizeFullNameWithSyntax(
+ const IdentifierInfo *Name, const IdentifierInfo *ScopeName,
+ Syntax SyntaxUsed) {
+ return (Twine(getSyntaxName(SyntaxUsed)) +
+ "::" + normalizeName(Name, ScopeName, SyntaxUsed))
+ .str();
+}
+
unsigned AttributeCommonInfo::calculateAttributeSpellingListIndex() const {
// Both variables will be used in tablegen generated
// attribute spell list index matching code.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0c4d0ef..0c2ee44 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -692,23 +692,15 @@ static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
RValue Call =
CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
- // Check the supported intrinsic.
if (unsigned BuiltinID = FD->getBuiltinID()) {
- auto IsErrnoIntrinsic = [&]() -> unsigned {
- switch (BuiltinID) {
- case Builtin::BIexpf:
- case Builtin::BI__builtin_expf:
- case Builtin::BI__builtin_expf128:
- return true;
- }
- // TODO: support more FP math libcalls
- return false;
- }();
-
+ // Check whether this is an FP math builtin function, such as BI__builtin_expf.
+ ASTContext &Context = CGF.getContext();
+ bool ConstWithoutErrnoAndExceptions =
+ Context.BuiltinInfo.isConstWithoutErrnoAndExceptions(BuiltinID);
// Restrict to target with errno, for example, MacOS doesn't set errno.
- if (IsErrnoIntrinsic && CGF.CGM.getLangOpts().MathErrno &&
- !CGF.Builder.getIsFPConstrained()) {
- ASTContext &Context = CGF.getContext();
+ // TODO: Support builtin functions with a complex return type, e.g. cacosh.
+ if (ConstWithoutErrnoAndExceptions && CGF.CGM.getLangOpts().MathErrno &&
+ !CGF.Builder.getIsFPConstrained() && Call.isScalar()) {
// Emit "int" TBAA metadata on FP math libcalls.
clang::QualType IntTy = Context.IntTy;
TBAAAccessInfo TBAAInfo = CGF.CGM.getTBAAAccessInfo(IntTy);
@@ -5986,8 +5978,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
getTarget().getTriple().isAMDGCN() ||
(getTarget().getTriple().isSPIRV() &&
getTarget().getTriple().getVendor() == Triple::VendorType::AMD)) {
- if (getLangOpts().OpenMPIsTargetDevice)
- return EmitOpenMPDevicePrintfCallExpr(E);
if (getTarget().getTriple().isNVPTX())
return EmitNVPTXDevicePrintfCallExpr(E);
if ((getTarget().getTriple().isAMDGCN() ||
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 2f3dd5d..ee6e8e0 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2522,6 +2522,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
}
}
}
+ // Remove 'convergent' if requested.
+ if (TargetDecl->hasAttr<NoConvergentAttr>())
+ FuncAttrs.removeAttribute(llvm::Attribute::Convergent);
}
// Add "sample-profile-suffix-elision-policy" attribute for internal linkage
@@ -5636,6 +5639,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
Attrs =
Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline);
+ // Remove call-site convergent attribute if requested.
+ if (InNoConvergentAttributedStmt)
+ Attrs =
+ Attrs.removeFnAttribute(getLLVMContext(), llvm::Attribute::Convergent);
+
// Apply some call-site-specific attributes.
// TODO: work this into building the attribute set.
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 3d8a715..b49dee2 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -1942,7 +1942,12 @@ CGDebugInfo::getOrCreateMethodType(const CXXMethodDecl *Method,
if (Method->isStatic())
return cast_or_null<llvm::DISubroutineType>(
getOrCreateType(QualType(Func, 0), Unit));
- return getOrCreateInstanceMethodType(Method->getThisType(), Func, Unit);
+
+ QualType ThisType;
+ if (!Method->hasCXXExplicitFunctionObjectParameter())
+ ThisType = Method->getThisType();
+
+ return getOrCreateInstanceMethodType(ThisType, Func, Unit);
}
llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType(
@@ -1974,27 +1979,31 @@ llvm::DISubroutineType *CGDebugInfo::getOrCreateInstanceMethodType(
Elts.push_back(Args[0]);
// "this" pointer is always first argument.
- const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl();
- if (isa<ClassTemplateSpecializationDecl>(RD)) {
- // Create pointer type directly in this case.
- const PointerType *ThisPtrTy = cast<PointerType>(ThisPtr);
- uint64_t Size = CGM.getContext().getTypeSize(ThisPtrTy);
- auto Align = getTypeAlignIfRequired(ThisPtrTy, CGM.getContext());
- llvm::DIType *PointeeType =
- getOrCreateType(ThisPtrTy->getPointeeType(), Unit);
- llvm::DIType *ThisPtrType =
- DBuilder.createPointerType(PointeeType, Size, Align);
- TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
- // TODO: This and the artificial type below are misleading, the
- // types aren't artificial the argument is, but the current
- // metadata doesn't represent that.
- ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
- Elts.push_back(ThisPtrType);
- } else {
- llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit);
- TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
- ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
- Elts.push_back(ThisPtrType);
+ // ThisPtr may be null if the member function has an explicit 'this'
+ // parameter.
+ if (!ThisPtr.isNull()) {
+ const CXXRecordDecl *RD = ThisPtr->getPointeeCXXRecordDecl();
+ if (isa<ClassTemplateSpecializationDecl>(RD)) {
+ // Create pointer type directly in this case.
+ const PointerType *ThisPtrTy = cast<PointerType>(ThisPtr);
+ uint64_t Size = CGM.getContext().getTypeSize(ThisPtrTy);
+ auto Align = getTypeAlignIfRequired(ThisPtrTy, CGM.getContext());
+ llvm::DIType *PointeeType =
+ getOrCreateType(ThisPtrTy->getPointeeType(), Unit);
+ llvm::DIType *ThisPtrType =
+ DBuilder.createPointerType(PointeeType, Size, Align);
+ TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
+ // TODO: This and the artificial type below are misleading, the
+ // types aren't artificial, the argument is, but the current
+ // metadata doesn't represent that.
+ ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
+ Elts.push_back(ThisPtrType);
+ } else {
+ llvm::DIType *ThisPtrType = getOrCreateType(ThisPtr, Unit);
+ TypeCache[ThisPtr.getAsOpaquePtr()].reset(ThisPtrType);
+ ThisPtrType = DBuilder.createObjectPointerType(ThisPtrType);
+ Elts.push_back(ThisPtrType);
+ }
}
// Copy rest of the arguments.
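For context, the null-ThisPtr case handled above arises with C++23 explicit object member functions, which have no implicit this; a minimal illustration:

    struct Widget {
      // C++23 "deducing this": the object parameter is spelled explicitly,
      // so there is no implicit `this` type for the debug info to describe.
      void draw(this const Widget &self);
    };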
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index c3251bb..882dbad 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -2790,7 +2790,7 @@ void CodeGenModule::EmitOMPRequiresDecl(const OMPRequiresDecl *D) {
}
void CodeGenModule::EmitOMPAllocateDecl(const OMPAllocateDecl *D) {
- for (const Expr *E : D->varlists()) {
+ for (const Expr *E : D->varlist()) {
const auto *DE = cast<DeclRefExpr>(E);
const auto *VD = cast<VarDecl>(DE->getDecl());
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index 4d45f6d..828a098 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -649,7 +649,6 @@ ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op,
ComplexPairTy ComplexExprEmitter::VisitUnaryPlus(const UnaryOperator *E,
QualType PromotionType) {
- E->hasStoredFPFeatures();
QualType promotionTy =
PromotionType.isNull()
? getPromotionType(E->getStoredFPFeaturesOrDefault(),
diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp
index b234073..84adf29 100644
--- a/clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -42,28 +42,6 @@ llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
}
-llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) {
- const char *Name = "__llvm_omp_vprintf";
- llvm::Module &M = CGM.getModule();
- llvm::Type *ArgTypes[] = {llvm::PointerType::getUnqual(M.getContext()),
- llvm::PointerType::getUnqual(M.getContext()),
- llvm::Type::getInt32Ty(M.getContext())};
- llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
- llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
-
- if (auto *F = M.getFunction(Name)) {
- if (F->getFunctionType() != VprintfFuncType) {
- CGM.Error(SourceLocation(),
- "Invalid type declaration for __llvm_omp_vprintf");
- return nullptr;
- }
- return F;
- }
-
- return llvm::Function::Create(
- VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
-}
-
// Transforms a call to printf into a call to the NVPTX vprintf syscall (which
// isn't particularly special; it's invoked just like a regular function).
// vprintf takes two args: A format string, and a pointer to a buffer containing
@@ -213,10 +191,3 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
return RValue::get(Printf);
}
-
-RValue CodeGenFunction::EmitOpenMPDevicePrintfCallExpr(const CallExpr *E) {
- assert(getTarget().getTriple().isNVPTX() ||
- getTarget().getTriple().isAMDGCN());
- return EmitDevicePrintfCallExpr(E, this, GetOpenMPVprintfDeclaration(CGM),
- true);
-}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index ec644ac..d869aa3 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3861,7 +3861,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
HasIterator = true;
continue;
}
- for (const Expr *E : C->varlists()) {
+ for (const Expr *E : C->varlist()) {
llvm::Value *Addr;
llvm::Value *Size;
std::tie(Addr, Size) = getPointerAndSize(CGF, E);
@@ -3894,7 +3894,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
continue;
OMPIteratorGeneratorScope IteratorScope(
CGF, cast_or_null<OMPIteratorExpr>(Modifier->IgnoreParenImpCasts()));
- for (const Expr *E : C->varlists()) {
+ for (const Expr *E : C->varlist()) {
llvm::Value *Addr;
llvm::Value *Size;
std::tie(Addr, Size) = getPointerAndSize(CGF, E);
@@ -8187,7 +8187,7 @@ public:
: CurDir(&Dir), CGF(CGF) {
// Extract firstprivate clause information.
for (const auto *C : Dir.getClausesOfKind<OMPFirstprivateClause>())
- for (const auto *D : C->varlists())
+ for (const auto *D : C->varlist())
FirstPrivateDecls.try_emplace(
cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl()), C->isImplicit());
// Extract implicit firstprivates from uses_allocators clauses.
@@ -11506,7 +11506,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
}
// Exclude vars in private clauses.
for (const auto *C : S.getClausesOfKind<OMPPrivateClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11516,7 +11516,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
}
}
for (const auto *C : S.getClausesOfKind<OMPFirstprivateClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11526,7 +11526,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
}
}
for (const auto *C : S.getClausesOfKind<OMPLastprivateClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11536,7 +11536,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
}
}
for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11546,7 +11546,7 @@ void CGOpenMPRuntime::LastprivateConditionalRAII::tryToDisableInnerAnalysis(
}
}
for (const auto *C : S.getClausesOfKind<OMPLinearClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -11589,7 +11589,7 @@ CGOpenMPRuntime::LastprivateConditionalRAII::LastprivateConditionalRAII(
if (C->getKind() != OMPC_LASTPRIVATE_conditional)
continue;
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
Data.DeclToUniqueName.insert(std::make_pair(
cast<DeclRefExpr>(Ref->IgnoreParenImpCasts())->getDecl(),
SmallString<16>(generateUniqueName(CGM, "pl_cond", Ref))));
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index aa97f68..30b6fce 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -723,6 +723,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
bool nomerge = false;
bool noinline = false;
bool alwaysinline = false;
+ bool noconvergent = false;
const CallExpr *musttail = nullptr;
for (const auto *A : S.getAttrs()) {
@@ -738,6 +739,9 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
case attr::AlwaysInline:
alwaysinline = true;
break;
+ case attr::NoConvergent:
+ noconvergent = true;
+ break;
case attr::MustTail: {
const Stmt *Sub = S.getSubStmt();
const ReturnStmt *R = cast<ReturnStmt>(Sub);
@@ -756,6 +760,7 @@ void CodeGenFunction::EmitAttributedStmt(const AttributedStmt &S) {
SaveAndRestore save_nomerge(InNoMergeAttributedStmt, nomerge);
SaveAndRestore save_noinline(InNoInlineAttributedStmt, noinline);
SaveAndRestore save_alwaysinline(InAlwaysInlineAttributedStmt, alwaysinline);
+ SaveAndRestore save_noconvergent(InNoConvergentAttributedStmt, noconvergent);
SaveAndRestore save_musttail(MustTailCall, musttail);
EmitStmt(S.getSubStmt(), S.getAttrs());
}
@@ -2465,7 +2470,8 @@ static llvm::MDNode *getAsmSrcLocInfo(const StringLiteral *Str,
static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
bool HasUnwindClobber, bool ReadOnly,
- bool ReadNone, bool NoMerge, const AsmStmt &S,
+ bool ReadNone, bool NoMerge, bool NoConvergent,
+ const AsmStmt &S,
const std::vector<llvm::Type *> &ResultRegTypes,
const std::vector<llvm::Type *> &ArgElemTypes,
CodeGenFunction &CGF,
@@ -2506,11 +2512,11 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
llvm::ConstantAsMetadata::get(Loc)));
}
- if (CGF.getLangOpts().assumeFunctionsAreConvergent())
+ if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent())
// Conservatively, mark all inline asm blocks in CUDA or OpenCL as
// convergent (meaning, they may call an intrinsically convergent op, such
// as bar.sync, and so can't have certain optimizations applied around
- // them).
+ // them) unless it's explicitly marked 'noconvergent'.
Result.addFnAttr(llvm::Attribute::Convergent);
// Extract all of the register value results from the asm.
if (ResultRegTypes.size() == 1) {
@@ -2751,7 +2757,10 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
if (RequiresCast) {
unsigned Size = getContext().getTypeSize(QTy);
- Ty = llvm::IntegerType::get(getLLVMContext(), Size);
+ if (Size)
+ Ty = llvm::IntegerType::get(getLLVMContext(), Size);
+ else
+ CGM.Error(OutExpr->getExprLoc(), "output size should not be zero");
}
ResultRegTypes.push_back(Ty);
// If this output is tied to an input, and if the input is larger, then
@@ -3037,9 +3046,10 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
if (IsGCCAsmGoto) {
CBR = Builder.CreateCallBr(IA, Fallthrough, Transfer, Args);
EmitBlock(Fallthrough);
- UpdateAsmCallInst(*CBR, HasSideEffect, false, ReadOnly, ReadNone,
- InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
- *this, RegResults);
+ UpdateAsmCallInst(*CBR, HasSideEffect, /*HasUnwindClobber=*/false, ReadOnly,
+ ReadNone, InNoMergeAttributedStmt,
+ InNoConvergentAttributedStmt, S, ResultRegTypes,
+ ArgElemTypes, *this, RegResults);
// Because we are emitting code top to bottom, we don't have enough
// information at this point to know precisely whether we have a critical
// edge. If we have outputs, split all indirect destinations.
@@ -3067,15 +3077,17 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
}
} else if (HasUnwindClobber) {
llvm::CallBase *Result = EmitCallOrInvoke(IA, Args, "");
- UpdateAsmCallInst(*Result, HasSideEffect, true, ReadOnly, ReadNone,
- InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
- *this, RegResults);
+ UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/true,
+ ReadOnly, ReadNone, InNoMergeAttributedStmt,
+ InNoConvergentAttributedStmt, S, ResultRegTypes,
+ ArgElemTypes, *this, RegResults);
} else {
llvm::CallInst *Result =
Builder.CreateCall(IA, Args, getBundlesForFunclet(IA));
- UpdateAsmCallInst(*Result, HasSideEffect, false, ReadOnly, ReadNone,
- InNoMergeAttributedStmt, S, ResultRegTypes, ArgElemTypes,
- *this, RegResults);
+ UpdateAsmCallInst(*Result, HasSideEffect, /*HasUnwindClobber=*/false,
+ ReadOnly, ReadNone, InNoMergeAttributedStmt,
+ InNoConvergentAttributedStmt, S, ResultRegTypes,
+ ArgElemTypes, *this, RegResults);
}
EmitAsmStores(*this, S, RegResults, ResultRegTypes, ResultTruncRegTypes,
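
The CGStmt.cpp hunks above thread the new InNoConvergentAttributedStmt flag from EmitAttributedStmt into UpdateAsmCallInst, so inline asm in CUDA/OpenCL is no longer forced to carry the convergent attribute when the enclosing statement opts out, and a zero-sized asm output is now rejected with a frontend error instead of producing a zero-width integer type. A minimal sketch of the opt-out, assuming the [[clang::noconvergent]] statement spelling from the same patch series (the spelling itself is not shown in this diff):

// CUDA device code, where assumeFunctionsAreConvergent() is true by default.
__device__ unsigned readClock() {
  unsigned v;
  // Without the attribute, UpdateAsmCallInst would add llvm::Attribute::Convergent
  // to this asm call; with it, the attribute is skipped.
  [[clang::noconvergent]] asm volatile("mov.u32 %0, %%clock;" : "=r"(v));
  return v;
}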
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 4ee9840..b1ac936 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -156,7 +156,7 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
}
// Mark private vars as undefs.
for (const auto *C : LD->getClausesOfKind<OMPPrivateClause>()) {
- for (const Expr *IRef : C->varlists()) {
+ for (const Expr *IRef : C->varlist()) {
const auto *OrigVD =
cast<VarDecl>(cast<DeclRefExpr>(IRef)->getDecl());
if (EmittedAsPrivate.insert(OrigVD->getCanonicalDecl()).second) {
@@ -257,13 +257,13 @@ public:
}
}
} else if (const auto *UDP = dyn_cast<OMPUseDevicePtrClause>(C)) {
- for (const Expr *E : UDP->varlists()) {
+ for (const Expr *E : UDP->varlist()) {
const Decl *D = cast<DeclRefExpr>(E)->getDecl();
if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
CGF.EmitVarDecl(*OED);
}
} else if (const auto *UDP = dyn_cast<OMPUseDeviceAddrClause>(C)) {
- for (const Expr *E : UDP->varlists()) {
+ for (const Expr *E : UDP->varlist()) {
const Decl *D = getBaseDecl(E);
if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
CGF.EmitVarDecl(*OED);
@@ -865,7 +865,7 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
bool FirstprivateIsLastprivate = false;
llvm::DenseMap<const VarDecl *, OpenMPLastprivateModifier> Lastprivates;
for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
- for (const auto *D : C->varlists())
+ for (const auto *D : C->varlist())
Lastprivates.try_emplace(
cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl(),
C->getKind());
@@ -1545,7 +1545,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
return;
llvm::DenseSet<CanonicalDeclPtr<const VarDecl>> PrivateDecls;
for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -1556,7 +1556,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
}
}
for (const auto *C : S.getClausesOfKind<OMPLastprivateClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -1567,7 +1567,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
}
}
for (const auto *C : S.getClausesOfKind<OMPLinearClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -1582,7 +1582,7 @@ checkForLastprivateConditionalUpdate(CodeGenFunction &CGF,
  // Firstprivates do not return a value but may be passed by reference - no need
// to check for updated lastprivate conditional.
for (const auto *C : S.getClausesOfKind<OMPFirstprivateClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
if (!Ref->getType()->isScalarType())
continue;
const auto *DRE = dyn_cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
@@ -2288,7 +2288,7 @@ static void emitAlignedClause(CodeGenFunction &CGF,
cast<llvm::ConstantInt>(CGF.EmitScalarExpr(AlignmentExpr));
ClauseAlignment = AlignmentCI->getValue();
}
- for (const Expr *E : Clause->varlists()) {
+ for (const Expr *E : Clause->varlist()) {
llvm::APInt Alignment(ClauseAlignment);
if (Alignment == 0) {
// OpenMP [2.8.1, Description]
@@ -2407,7 +2407,7 @@ void CodeGenFunction::EmitOMPLinearClause(
}
for (const auto *C : D.getClausesOfKind<OMPLinearClause>()) {
auto CurPrivate = C->privates().begin();
- for (const Expr *E : C->varlists()) {
+ for (const Expr *E : C->varlist()) {
const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
const auto *PrivateVD =
cast<VarDecl>(cast<DeclRefExpr>(*CurPrivate)->getDecl());
@@ -2711,7 +2711,7 @@ GetAlignedMapping(const OMPLoopDirective &S, CodeGenFunction &CGF) {
cast<llvm::ConstantInt>(CGF.EmitScalarExpr(AlignmentExpr));
ClauseAlignment = AlignmentCI->getValue();
}
- for (const Expr *E : Clause->varlists()) {
+ for (const Expr *E : Clause->varlist()) {
llvm::APInt Alignment(ClauseAlignment);
if (Alignment == 0) {
// OpenMP [2.8.1, Description]
@@ -4329,7 +4329,7 @@ void CodeGenFunction::EmitOMPSingleDirective(const OMPSingleDirective &S) {
// Build a list of copyprivate variables along with helper expressions
// (<source>, <destination>, <destination>=<source> expressions)
for (const auto *C : S.getClausesOfKind<OMPCopyprivateClause>()) {
- CopyprivateVars.append(C->varlists().begin(), C->varlists().end());
+ CopyprivateVars.append(C->varlist_begin(), C->varlist_end());
DestExprs.append(C->destination_exprs().begin(),
C->destination_exprs().end());
SrcExprs.append(C->source_exprs().begin(), C->source_exprs().end());
@@ -5035,7 +5035,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
auto IPriv = C->privates().begin();
auto IRed = C->reduction_ops().begin();
auto ITD = C->taskgroup_descriptors().begin();
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
InRedVars.emplace_back(Ref);
InRedPrivs.emplace_back(*IPriv);
InRedOps.emplace_back(*IRed);
@@ -5318,7 +5318,7 @@ void CodeGenFunction::processInReduction(const OMPExecutableDirective &S,
auto IPriv = C->privates().begin();
auto IRed = C->reduction_ops().begin();
auto ITD = C->taskgroup_descriptors().begin();
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
InRedVars.emplace_back(Ref);
InRedPrivs.emplace_back(*IPriv);
InRedOps.emplace_back(*IRed);
@@ -7346,7 +7346,7 @@ void CodeGenFunction::EmitOMPUseDevicePtrClause(
const llvm::DenseMap<const ValueDecl *, llvm::Value *>
CaptureDeviceAddrMap) {
llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>, 4> Processed;
- for (const Expr *OrigVarIt : C.varlists()) {
+ for (const Expr *OrigVarIt : C.varlist()) {
const auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(OrigVarIt)->getDecl());
if (!Processed.insert(OrigVD).second)
continue;
@@ -7397,7 +7397,7 @@ void CodeGenFunction::EmitOMPUseDeviceAddrClause(
const llvm::DenseMap<const ValueDecl *, llvm::Value *>
CaptureDeviceAddrMap) {
llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>, 4> Processed;
- for (const Expr *Ref : C.varlists()) {
+ for (const Expr *Ref : C.varlist()) {
const VarDecl *OrigVD = getBaseDecl(Ref);
if (!Processed.insert(OrigVD).second)
continue;
@@ -7494,13 +7494,13 @@ void CodeGenFunction::EmitOMPTargetDataDirective(
if (CGM.getLangOpts().OMPTargetTriples.empty()) {
// Emit helper decls of the use_device_ptr/use_device_addr clauses.
for (const auto *C : S.getClausesOfKind<OMPUseDevicePtrClause>())
- for (const Expr *E : C->varlists()) {
+ for (const Expr *E : C->varlist()) {
const Decl *D = cast<DeclRefExpr>(E)->getDecl();
if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
CGF.EmitVarDecl(*OED);
}
for (const auto *C : S.getClausesOfKind<OMPUseDeviceAddrClause>())
- for (const Expr *E : C->varlists()) {
+ for (const Expr *E : C->varlist()) {
const Decl *D = getBaseDecl(E);
if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
CGF.EmitVarDecl(*OED);
@@ -8232,7 +8232,7 @@ void CodeGenFunction::EmitSimpleOMPExecutableDirective(
if (isOpenMPTaskingDirective(D.getDirectiveKind())) {
// Capture global firstprivates to avoid crash.
for (const auto *C : D.getClausesOfKind<OMPFirstprivateClause>()) {
- for (const Expr *Ref : C->varlists()) {
+ for (const Expr *Ref : C->varlist()) {
const auto *DRE = cast<DeclRefExpr>(Ref->IgnoreParenImpCasts());
if (!DRE)
continue;
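
Every OpenMP change in this file (and in CGOpenMPRuntime.cpp above) is the mechanical rename of the clause accessor from varlists() to varlist(), with varlist_begin()/varlist_end() for the iterator-pair form used in EmitOMPSingleDirective. A minimal sketch of the updated consumer pattern, using only names that appear in the hunks:

#include "clang/AST/StmtOpenMP.h"
#include "llvm/ADT/SmallVector.h"

static void collectFirstprivateVars(
    const clang::OMPExecutableDirective &D,
    llvm::SmallVectorImpl<const clang::VarDecl *> &Out) {
  using namespace clang;
  for (const auto *C : D.getClausesOfKind<OMPFirstprivateClause>())
    for (const Expr *Ref : C->varlist())   // was: C->varlists()
      Out.push_back(cast<VarDecl>(
          cast<DeclRefExpr>(Ref->IgnoreParenImpCasts())->getDecl()));
}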
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index d607869..af20155 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -991,6 +991,9 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
if (D && D->hasAttr<NoProfileFunctionAttr>())
Fn->addFnAttr(llvm::Attribute::NoProfile);
+ if (D && D->hasAttr<HybridPatchableAttr>())
+ Fn->addFnAttr(llvm::Attribute::HybridPatchable);
+
if (D) {
// Function attributes take precedence over command line flags.
if (auto *A = D->getAttr<FunctionReturnThunksAttr>()) {
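
The new hunk maps a HybridPatchableAttr on the declaration onto llvm::Attribute::HybridPatchable (used for ARM64EC patchable functions). A rough usage sketch; the GNU __attribute__((hybrid_patchable)) spelling is assumed here, since only the ParsedAttr::AT_HybridPatchable handling appears later in this diff:

// Expected to be emitted with the HybridPatchable LLVM function attribute.
extern "C" __attribute__((hybrid_patchable)) int patched_entry(int x) {
  return x + 1;
}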
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index bd62c65..1911fba 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -612,6 +612,9 @@ public:
/// True if the current statement has always_inline attribute.
bool InAlwaysInlineAttributedStmt = false;
+ /// True if the current statement has noconvergent attribute.
+ bool InNoConvergentAttributedStmt = false;
+
// The CallExpr within the current statement that the musttail attribute
// applies to. nullptr if there is no 'musttail' on the current statement.
const CallExpr *MustTailCall = nullptr;
@@ -4536,7 +4539,6 @@ public:
RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
- RValue EmitOpenMPDevicePrintfCallExpr(const CallExpr *E);
RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
const CallExpr *E, ReturnValueSlot ReturnValue);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 63ed5b4..760185d 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -5659,7 +5659,7 @@ void CodeGenModule::EmitExternalFunctionDeclaration(const FunctionDecl *FD) {
if (getCodeGenOpts().hasReducedDebugInfo()) {
auto *Ty = getTypes().ConvertType(FD->getType());
StringRef MangledName = getMangledName(FD);
- auto *Fn = dyn_cast<llvm::Function>(
+ auto *Fn = cast<llvm::Function>(
GetOrCreateLLVMFunction(MangledName, Ty, FD, /* ForVTable */ false));
if (!Fn->getSubprogram())
DI->EmitFunctionDecl(FD, FD->getLocation(), FD->getType(), Fn);
@@ -7484,7 +7484,7 @@ void CodeGenModule::EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
// Do not emit threadprivates in simd-only mode.
if (LangOpts.OpenMP && LangOpts.OpenMPSimd)
return;
- for (auto RefExpr : D->varlists()) {
+ for (auto RefExpr : D->varlist()) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(RefExpr)->getDecl());
bool PerformInit =
VD->getAnyInitializer() &&
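
The dyn_cast -> cast change above encodes the invariant that GetOrCreateLLVMFunction returns an llvm::Function at this call site: the result was already dereferenced unconditionally, so cast<> asserts on a mismatch instead of silently yielding a null pointer. A tiny illustration of the distinction:

#include "llvm/IR/Function.h"

llvm::Function *asFunction(llvm::Constant *C) {
  // dyn_cast<> would return nullptr if C were not a Function; cast<> asserts
  // the invariant (in asserts builds) and never returns null.
  return llvm::cast<llvm::Function>(C);
}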
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index b9df54b..1dec3cd 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -883,8 +883,10 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming(
if (!CalleeIsStreamingCompatible &&
(CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible))
- CGM.getDiags().Report(CallLoc,
- diag::err_function_always_inline_attribute_mismatch)
+ CGM.getDiags().Report(
+ CallLoc, CalleeIsStreaming
+ ? diag::err_function_always_inline_attribute_mismatch
+ : diag::warn_function_always_inline_attribute_mismatch)
<< Caller->getDeclName() << Callee->getDeclName() << "streaming";
if (auto *NewAttr = Callee->getAttr<ArmNewAttr>())
if (NewAttr->isNewZA())
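
The AArch64.cpp hunk downgrades the streaming-mode always_inline mismatch from a hard error to a warning when the callee is an ordinary non-streaming function, keeping the error only when the callee itself is streaming. A sketch of the newly-warning case, assuming an SME-enabled target (the __arm_streaming keyword is ACLE syntax and not part of this diff):

__attribute__((always_inline)) void normal_callee(void) {}

void streaming_caller(void) __arm_streaming {
  // always_inline across a streaming-mode boundary: previously
  // err_function_always_inline_attribute_mismatch, now the warn_ variant,
  // because the callee is not a streaming function.
  normal_callee();
}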
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index f6c7300..aa8f919 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -620,19 +620,33 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
const char *LinkingOutput) const {
std::string Linker = getToolChain().GetLinkerPath();
ArgStringList CmdArgs;
- CmdArgs.push_back("--no-undefined");
- CmdArgs.push_back("-shared");
+ if (!Args.hasArg(options::OPT_r)) {
+ CmdArgs.push_back("--no-undefined");
+ CmdArgs.push_back("-shared");
+ }
addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs);
Args.AddAllArgs(CmdArgs, options::OPT_L);
getToolChain().AddFilePathLibArgs(Args, CmdArgs);
AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);
- if (C.getDriver().isUsingLTO())
+ if (C.getDriver().isUsingLTO()) {
addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0],
C.getDriver().getLTOMode() == LTOK_Thin);
- else if (Args.hasArg(options::OPT_mcpu_EQ))
+ } else if (Args.hasArg(options::OPT_mcpu_EQ)) {
CmdArgs.push_back(Args.MakeArgString(
- "-plugin-opt=mcpu=" + Args.getLastArgValue(options::OPT_mcpu_EQ)));
+ "-plugin-opt=mcpu=" +
+ getProcessorFromTargetID(getToolChain().getTriple(),
+ Args.getLastArgValue(options::OPT_mcpu_EQ))));
+ }
+
+ // Always pass the target-id features to the LTO job.
+ std::vector<StringRef> Features;
+ getAMDGPUTargetFeatures(C.getDriver(), getToolChain().getTriple(), Args,
+ Features);
+ if (!Features.empty()) {
+ CmdArgs.push_back(
+ Args.MakeArgString("-plugin-opt=-mattr=" + llvm::join(Features, ",")));
+ }
addGPULibraries(getToolChain(), Args, CmdArgs);
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index 2f63333..dc6c869 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -266,19 +266,29 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
}
bool IsNegative = Name.starts_with("no-");
+
+ bool Not64Bit = ArchType != llvm::Triple::x86_64;
+ if (Not64Bit && Name == "uintr")
+ D.Diag(diag::err_drv_unsupported_opt_for_target)
+ << A->getSpelling() << Triple.getTriple();
+
if (A->getOption().matches(options::OPT_mapx_features_EQ) ||
A->getOption().matches(options::OPT_mno_apx_features_EQ)) {
+ if (Not64Bit && !IsNegative)
+ D.Diag(diag::err_drv_unsupported_opt_for_target)
+ << StringRef(A->getSpelling().str() + "|-mapxf")
+ << Triple.getTriple();
+
for (StringRef Value : A->getValues()) {
- if (Value == "egpr" || Value == "push2pop2" || Value == "ppx" ||
- Value == "ndd" || Value == "ccmp" || Value == "nf" ||
- Value == "cf" || Value == "zu") {
- Features.push_back(
- Args.MakeArgString((IsNegative ? "-" : "+") + Value));
- continue;
- }
- D.Diag(clang::diag::err_drv_unsupported_option_argument)
- << A->getSpelling() << Value;
+ if (Value != "egpr" && Value != "push2pop2" && Value != "ppx" &&
+ Value != "ndd" && Value != "ccmp" && Value != "nf" &&
+ Value != "cf" && Value != "zu")
+ D.Diag(clang::diag::err_drv_unsupported_option_argument)
+ << A->getSpelling() << Value;
+
+ Features.push_back(
+ Args.MakeArgString((IsNegative ? "-" : "+") + Value));
}
continue;
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index fa3d900..843d68c 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3774,7 +3774,7 @@ static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs,
CmdArgs.push_back(Args.MakeArgString(CLExtStr));
}
- if (Arg *A = Args.getLastArg(options::OPT_cl_finite_math_only)) {
+ if (Args.hasArg(options::OPT_cl_finite_math_only)) {
CmdArgs.push_back("-menable-no-infs");
CmdArgs.push_back("-menable-no-nans");
}
@@ -9155,7 +9155,7 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
// If we disable the GPU C library support it needs to be forwarded to the
// link job.
if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, true))
- CmdArgs.push_back("--device-linker=-nolibc");
+ CmdArgs.push_back("--device-compiler=-nolibc");
// Add the linker arguments to be forwarded by the wrapper.
CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 1e37d9d..3d07142 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1957,8 +1957,8 @@ tools::ParsePICArgs(const ToolChain &ToolChain, const ArgList &Args) {
return std::make_tuple(RelocM, 0U, false);
}
-// `-falign-functions` indicates that the functions should be aligned to a
-// 16-byte boundary.
+// `-falign-functions` indicates that the functions should be aligned to the
+// backend's preferred alignment.
//
// `-falign-functions=1` is the same as `-fno-align-functions`.
//
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index e98e574..6e10e3d 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -596,14 +596,16 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-v");
StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
- if (GPUArch.empty()) {
+ if (GPUArch.empty() && !C.getDriver().isUsingLTO()) {
C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
<< getToolChain().getArchName() << getShortName();
return;
}
- CmdArgs.push_back("-arch");
- CmdArgs.push_back(Args.MakeArgString(GPUArch));
+ if (!GPUArch.empty()) {
+ CmdArgs.push_back("-arch");
+ CmdArgs.push_back(Args.MakeArgString(GPUArch));
+ }
if (Args.hasArg(options::OPT_ptxas_path_EQ))
CmdArgs.push_back(Args.MakeArgString(
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index c4f2375..f5de5eb 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -342,6 +342,9 @@ void Flang::AddAMDGPUTargetArgs(const ArgList &Args,
StringRef Val = A->getValue();
CmdArgs.push_back(Args.MakeArgString("-mcode-object-version=" + Val));
}
+
+ const ToolChain &TC = getToolChain();
+ TC.addClangTargetOptions(Args, CmdArgs, Action::OffloadKind::OFK_OpenMP);
}
void Flang::addTargetOptions(const ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index f883f29..a9e612c 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -177,7 +177,9 @@ void tools::PS4cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
if (StringRef Threads = getLTOParallelism(Args, D); !Threads.empty())
AddLTOFlag(Twine("-threads=") + Threads);
- CmdArgs.push_back(Args.MakeArgString(Twine("-lto-debug-options=") + LTOArgs));
+ if (*LTOArgs)
+ CmdArgs.push_back(
+ Args.MakeArgString(Twine("-lto-debug-options=") + LTOArgs));
if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs))
TC.addSanitizerArgs(Args, CmdArgs, "-l", "");
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 16ab18e..8cd5cf2 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -155,8 +155,8 @@ private:
if (NonTemplateLess.count(CurrentToken->Previous) > 0)
return false;
- const FormatToken &Previous = *CurrentToken->Previous; // The '<'.
- if (Previous.Previous) {
+ if (const auto &Previous = *CurrentToken->Previous; // The '<'.
+ Previous.Previous) {
if (Previous.Previous->Tok.isLiteral())
return false;
if (Previous.Previous->is(tok::r_brace))
@@ -176,11 +176,13 @@ private:
FormatToken *Left = CurrentToken->Previous;
Left->ParentBracket = Contexts.back().ContextKind;
ScopedContextCreator ContextCreator(*this, tok::less, 12);
-
Contexts.back().IsExpression = false;
+
+ const auto *BeforeLess = Left->Previous;
+
// If there's a template keyword before the opening angle bracket, this is a
// template parameter, not an argument.
- if (Left->Previous && Left->Previous->isNot(tok::kw_template))
+ if (BeforeLess && BeforeLess->isNot(tok::kw_template))
Contexts.back().ContextType = Context::TemplateArgument;
if (Style.Language == FormatStyle::LK_Java &&
@@ -188,19 +190,24 @@ private:
next();
}
- while (CurrentToken) {
+ for (bool SeenTernaryOperator = false; CurrentToken;) {
+ const bool InExpr = Contexts[Contexts.size() - 2].IsExpression;
if (CurrentToken->is(tok::greater)) {
+ const auto *Next = CurrentToken->Next;
// Try to do a better job at looking for ">>" within the condition of
// a statement. Conservatively insert spaces between consecutive ">"
// tokens to prevent splitting right bitshift operators and potentially
// altering program semantics. This check is overly conservative and
// will prevent spaces from being inserted in select nested template
// parameter cases, but should not alter program semantics.
- if (CurrentToken->Next && CurrentToken->Next->is(tok::greater) &&
+ if (Next && Next->is(tok::greater) &&
Left->ParentBracket != tok::less &&
CurrentToken->getStartOfNonWhitespace() ==
- CurrentToken->Next->getStartOfNonWhitespace().getLocWithOffset(
- -1)) {
+ Next->getStartOfNonWhitespace().getLocWithOffset(-1)) {
+ return false;
+ }
+ if (InExpr && SeenTernaryOperator &&
+ (!Next || !Next->isOneOf(tok::l_paren, tok::l_brace))) {
return false;
}
Left->MatchingParen = CurrentToken;
@@ -211,14 +218,14 @@ private:
// msg: < item: data >
// In TT_TextProto, map<key, value> does not occur.
if (Style.Language == FormatStyle::LK_TextProto ||
- (Style.Language == FormatStyle::LK_Proto && Left->Previous &&
- Left->Previous->isOneOf(TT_SelectorName, TT_DictLiteral))) {
+ (Style.Language == FormatStyle::LK_Proto && BeforeLess &&
+ BeforeLess->isOneOf(TT_SelectorName, TT_DictLiteral))) {
CurrentToken->setType(TT_DictLiteral);
} else {
CurrentToken->setType(TT_TemplateCloser);
CurrentToken->Tok.setLength(1);
}
- if (CurrentToken->Next && CurrentToken->Next->Tok.isLiteral())
+ if (Next && Next->Tok.isLiteral())
return false;
next();
return true;
@@ -230,18 +237,21 @@ private:
}
if (CurrentToken->isOneOf(tok::r_paren, tok::r_square, tok::r_brace))
return false;
+ const auto &Prev = *CurrentToken->Previous;
// If a && or || is found and interpreted as a binary operator, this set
// of angles is likely part of something like "a < b && c > d". If the
// angles are inside an expression, the ||/&& might also be a binary
// operator that was misinterpreted because we are parsing template
// parameters.
// FIXME: This is getting out of hand, write a decent parser.
- if (CurrentToken->Previous->isOneOf(tok::pipepipe, tok::ampamp) &&
- CurrentToken->Previous->is(TT_BinaryOperator) &&
- Contexts[Contexts.size() - 2].IsExpression &&
- !Line.startsWith(tok::kw_template)) {
- return false;
+ if (InExpr && !Line.startsWith(tok::kw_template) &&
+ Prev.is(TT_BinaryOperator)) {
+ const auto Precedence = Prev.getPrecedence();
+ if (Precedence > prec::Conditional && Precedence < prec::Relational)
+ return false;
}
+ if (Prev.is(TT_ConditionalExpr))
+ SeenTernaryOperator = true;
updateParameterCount(Left, CurrentToken);
if (Style.Language == FormatStyle::LK_Proto) {
if (FormatToken *Previous = CurrentToken->getPreviousNonComment()) {
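
The TokenAnnotator changes track whether a ternary operator appears between a '<' and a '>' while inside an expression; in that case the pair is now treated as comparisons rather than as template angle brackets (unless the '>' is followed by '(' or '{'). A sketch of the expression shape this targets; only formatting, not semantics, is affected:

int pick(int a, int b, int c, int d, int e) {
  return a < b ? c : d > e;   // annotated as (a < b) ? c : (d > e), not a template
}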
diff --git a/clang/lib/Lex/PPCaching.cpp b/clang/lib/Lex/PPCaching.cpp
index f38ff62..cbacda9 100644
--- a/clang/lib/Lex/PPCaching.cpp
+++ b/clang/lib/Lex/PPCaching.cpp
@@ -14,6 +14,15 @@
#include "clang/Lex/Preprocessor.h"
using namespace clang;
+std::pair<Preprocessor::CachedTokensTy::size_type, bool>
+Preprocessor::LastBacktrackPos() {
+ assert(isBacktrackEnabled());
+ auto BacktrackPos = BacktrackPositions.back();
+ bool Unannotated =
+ static_cast<CachedTokensTy::difference_type>(BacktrackPos) < 0;
+ return {Unannotated ? ~BacktrackPos : BacktrackPos, Unannotated};
+}
+
// EnableBacktrackAtThisPos - From the point that this method is called, and
// until CommitBacktrackedTokens() or Backtrack() is called, the Preprocessor
// keeps track of the lexed tokens so that a subsequent Backtrack() call will
@@ -22,26 +31,45 @@ using namespace clang;
// Nested backtracks are allowed, meaning that EnableBacktrackAtThisPos can
// be called multiple times and CommitBacktrackedTokens/Backtrack calls will
// be combined with the EnableBacktrackAtThisPos calls in reverse order.
-void Preprocessor::EnableBacktrackAtThisPos() {
+void Preprocessor::EnableBacktrackAtThisPos(bool Unannotated) {
assert(LexLevel == 0 && "cannot use lookahead while lexing");
- BacktrackPositions.push_back(CachedLexPos);
+ BacktrackPositions.push_back(Unannotated ? ~CachedLexPos : CachedLexPos);
+ if (Unannotated)
+ UnannotatedBacktrackTokens.emplace_back(CachedTokens, CachedTokens.size());
EnterCachingLexMode();
}
+Preprocessor::CachedTokensTy Preprocessor::PopUnannotatedBacktrackTokens() {
+ assert(isUnannotatedBacktrackEnabled() && "missing unannotated tokens?");
+ auto [UnannotatedTokens, NumCachedToks] =
+ std::move(UnannotatedBacktrackTokens.back());
+ UnannotatedBacktrackTokens.pop_back();
+ // If another unannotated backtrack is active, propagate any tokens that were
+ // lexed (not cached) since EnableBacktrackAtThisPos was last called.
+ if (isUnannotatedBacktrackEnabled())
+ UnannotatedBacktrackTokens.back().first.append(
+ UnannotatedTokens.begin() + NumCachedToks, UnannotatedTokens.end());
+ return std::move(UnannotatedTokens);
+}
+
// Disable the last EnableBacktrackAtThisPos call.
void Preprocessor::CommitBacktrackedTokens() {
- assert(!BacktrackPositions.empty()
- && "EnableBacktrackAtThisPos was not called!");
+ assert(isBacktrackEnabled() && "EnableBacktrackAtThisPos was not called!");
+ auto [BacktrackPos, Unannotated] = LastBacktrackPos();
BacktrackPositions.pop_back();
+ if (Unannotated)
+ PopUnannotatedBacktrackTokens();
}
// Make Preprocessor re-lex the tokens that were lexed since
// EnableBacktrackAtThisPos() was previously called.
void Preprocessor::Backtrack() {
- assert(!BacktrackPositions.empty()
- && "EnableBacktrackAtThisPos was not called!");
- CachedLexPos = BacktrackPositions.back();
+ assert(isBacktrackEnabled() && "EnableBacktrackAtThisPos was not called!");
+ auto [BacktrackPos, Unannotated] = LastBacktrackPos();
BacktrackPositions.pop_back();
+ CachedLexPos = BacktrackPos;
+ if (Unannotated)
+ CachedTokens = PopUnannotatedBacktrackTokens();
recomputeCurLexerKind();
}
@@ -67,6 +95,8 @@ void Preprocessor::CachingLex(Token &Result) {
EnterCachingLexModeUnchecked();
CachedTokens.push_back(Result);
++CachedLexPos;
+ if (isUnannotatedBacktrackEnabled())
+ UnannotatedBacktrackTokens.back().first.push_back(Result);
return;
}
@@ -108,6 +138,8 @@ const Token &Preprocessor::PeekAhead(unsigned N) {
for (size_t C = CachedLexPos + N - CachedTokens.size(); C > 0; --C) {
CachedTokens.push_back(Token());
Lex(CachedTokens.back());
+ if (isUnannotatedBacktrackEnabled())
+ UnannotatedBacktrackTokens.back().first.push_back(CachedTokens.back());
}
EnterCachingLexMode();
return CachedTokens.back();
@@ -124,7 +156,7 @@ void Preprocessor::AnnotatePreviousCachedTokens(const Token &Tok) {
for (CachedTokensTy::size_type i = CachedLexPos; i != 0; --i) {
CachedTokensTy::iterator AnnotBegin = CachedTokens.begin() + i-1;
if (AnnotBegin->getLocation() == Tok.getLocation()) {
- assert((BacktrackPositions.empty() || BacktrackPositions.back() <= i) &&
+ assert((!isBacktrackEnabled() || LastBacktrackPos().first <= i) &&
"The backtrack pos points inside the annotated tokens!");
// Replace the cached tokens with the single annotation token.
if (i < CachedLexPos)
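
PPCaching.cpp now lets a backtrack position also snapshot the un-annotated token stream: LastBacktrackPos encodes the flag in the sign bit (the ~CachedLexPos trick), and UnannotatedBacktrackTokens holds the raw tokens so Backtrack() can restore them. A minimal caller-side sketch using only the methods shown above:

#include "clang/Lex/Preprocessor.h"

void speculate(clang::Preprocessor &PP) {
  PP.EnableBacktrackAtThisPos(/*Unannotated=*/true); // also snapshot raw tokens
  // ... lex and possibly annotate tokens speculatively ...
  PP.Backtrack(); // restores CachedTokens to the un-annotated snapshot
}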
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 879f01e..1e31fcc 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -1604,6 +1604,16 @@ static bool isTargetVariantEnvironment(const TargetInfo &TI,
return false;
}
+#if defined(__sun__) && defined(__svr4__)
+// GCC mangles std::tm as tm for binary compatibility on Solaris (Issue
+// #33114). We need to match this to allow the std::put_time calls to link
+// (PR #99075).
+asm("_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+ "RSt8ios_basecPKSt2tmPKcSB_ = "
+ "_ZNKSt8time_putIcSt19ostreambuf_iteratorIcSt11char_traitsIcEEE3putES3_"
+ "RSt8ios_basecPK2tmPKcSB_");
+#endif
+
/// ExpandBuiltinMacro - If an identifier token is read that is to be expanded
/// as a builtin macro, handle it and return the next token as 'Tok'.
void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 63e27e6..f0b4593 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -170,7 +170,7 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
}
Preprocessor::~Preprocessor() {
- assert(BacktrackPositions.empty() && "EnableBacktrack/Backtrack imbalance!");
+ assert(!isBacktrackEnabled() && "EnableBacktrack/Backtrack imbalance!");
IncludeMacroStack.clear();
diff --git a/clang/lib/Parse/ParseCXXInlineMethods.cpp b/clang/lib/Parse/ParseCXXInlineMethods.cpp
index 9ccbbf9..b461743 100644
--- a/clang/lib/Parse/ParseCXXInlineMethods.cpp
+++ b/clang/lib/Parse/ParseCXXInlineMethods.cpp
@@ -1205,41 +1205,6 @@ bool Parser::ConsumeAndStoreConditional(CachedTokens &Toks) {
return true;
}
-/// A tentative parsing action that can also revert token annotations.
-class Parser::UnannotatedTentativeParsingAction : public TentativeParsingAction {
-public:
- explicit UnannotatedTentativeParsingAction(Parser &Self,
- tok::TokenKind EndKind)
- : TentativeParsingAction(Self), Self(Self), EndKind(EndKind) {
- // Stash away the old token stream, so we can restore it once the
- // tentative parse is complete.
- TentativeParsingAction Inner(Self);
- Self.ConsumeAndStoreUntil(EndKind, Toks, true, /*ConsumeFinalToken*/false);
- Inner.Revert();
- }
-
- void RevertAnnotations() {
- Revert();
-
- // Put back the original tokens.
- Self.SkipUntil(EndKind, StopAtSemi | StopBeforeMatch);
- if (Toks.size()) {
- auto Buffer = std::make_unique<Token[]>(Toks.size());
- std::copy(Toks.begin() + 1, Toks.end(), Buffer.get());
- Buffer[Toks.size() - 1] = Self.Tok;
- Self.PP.EnterTokenStream(std::move(Buffer), Toks.size(), true,
- /*IsReinject*/ true);
-
- Self.Tok = Toks.front();
- }
- }
-
-private:
- Parser &Self;
- CachedTokens Toks;
- tok::TokenKind EndKind;
-};
-
/// ConsumeAndStoreInitializer - Consume and store the token at the passed token
/// container until the end of the current initializer expression (either a
/// default argument or an in-class initializer for a non-static data member).
@@ -1277,9 +1242,7 @@ bool Parser::ConsumeAndStoreInitializer(CachedTokens &Toks,
// syntactically-valid init-declarator-list, then this comma ends
// the default initializer.
{
- UnannotatedTentativeParsingAction PA(*this,
- CIK == CIK_DefaultInitializer
- ? tok::semi : tok::r_paren);
+ TentativeParsingAction TPA(*this, /*Unannotated=*/true);
Sema::TentativeAnalysisScope Scope(Actions);
TPResult Result = TPResult::Error;
@@ -1307,7 +1270,7 @@ bool Parser::ConsumeAndStoreInitializer(CachedTokens &Toks,
// Put the token stream back and undo any annotations we performed
// after the comma. They may reflect a different parse than the one
// we will actually perform at the end of the class.
- PA.RevertAnnotations();
+ TPA.Revert();
// If what follows could be a declaration, it is a declaration.
if (Result != TPResult::False && Result != TPResult::Error)
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 7ce9a9c..4a2d9a6 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -314,64 +314,92 @@ void Parser::ParseGNUAttributes(ParsedAttributes &Attrs,
}
/// Determine whether the given attribute has an identifier argument.
-static bool attributeHasIdentifierArg(const IdentifierInfo &II) {
+static bool attributeHasIdentifierArg(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_IDENTIFIER_ARG_LIST
- return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+ return llvm::StringSwitch<bool>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
- .Default(false);
+ .Default(false);
#undef CLANG_ATTR_IDENTIFIER_ARG_LIST
}
/// Determine which arguments of the given attribute are string literals.
static ParsedAttributeArgumentsProperties
-attributeStringLiteralListArg(const llvm::Triple &T, const IdentifierInfo &II) {
+attributeStringLiteralListArg(const llvm::Triple &T, const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_STRING_LITERAL_ARG_LIST
- return llvm::StringSwitch<uint32_t>(normalizeAttrName(II.getName()))
+ return llvm::StringSwitch<uint32_t>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
.Default(0);
#undef CLANG_ATTR_STRING_LITERAL_ARG_LIST
}
/// Determine whether the given attribute has a variadic identifier argument.
-static bool attributeHasVariadicIdentifierArg(const IdentifierInfo &II) {
+static bool attributeHasVariadicIdentifierArg(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
- return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+ return llvm::StringSwitch<bool>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
- .Default(false);
+ .Default(false);
#undef CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
}
/// Determine whether the given attribute treats kw_this as an identifier.
-static bool attributeTreatsKeywordThisAsIdentifier(const IdentifierInfo &II) {
+static bool attributeTreatsKeywordThisAsIdentifier(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST
- return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+ return llvm::StringSwitch<bool>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
- .Default(false);
+ .Default(false);
#undef CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST
}
/// Determine if an attribute accepts parameter packs.
-static bool attributeAcceptsExprPack(const IdentifierInfo &II) {
+static bool attributeAcceptsExprPack(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_ACCEPTS_EXPR_PACK
- return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+ return llvm::StringSwitch<bool>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
.Default(false);
#undef CLANG_ATTR_ACCEPTS_EXPR_PACK
}
/// Determine whether the given attribute parses a type argument.
-static bool attributeIsTypeArgAttr(const IdentifierInfo &II) {
+static bool attributeIsTypeArgAttr(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_TYPE_ARG_LIST
- return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+ return llvm::StringSwitch<bool>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
- .Default(false);
+ .Default(false);
#undef CLANG_ATTR_TYPE_ARG_LIST
}
/// Determine whether the given attribute takes identifier arguments.
-static bool attributeHasStrictIdentifierArgs(const IdentifierInfo &II) {
+static bool attributeHasStrictIdentifierArgs(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
- return (llvm::StringSwitch<uint64_t>(normalizeAttrName(II.getName()))
+ return (llvm::StringSwitch<uint64_t>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
.Default(0)) != 0;
#undef CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
@@ -380,9 +408,13 @@ static bool attributeHasStrictIdentifierArgs(const IdentifierInfo &II) {
/// Determine whether the given attribute takes an identifier argument at a
/// specific index
static bool attributeHasStrictIdentifierArgAtIndex(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName,
size_t argIndex) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
- return (llvm::StringSwitch<uint64_t>(normalizeAttrName(II.getName()))
+ return (llvm::StringSwitch<uint64_t>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
.Default(0)) &
(1ull << argIndex);
@@ -391,11 +423,15 @@ static bool attributeHasStrictIdentifierArgAtIndex(const IdentifierInfo &II,
/// Determine whether the given attribute requires parsing its arguments
/// in an unevaluated context or not.
-static bool attributeParsedArgsUnevaluated(const IdentifierInfo &II) {
+static bool attributeParsedArgsUnevaluated(const IdentifierInfo &II,
+ ParsedAttr::Syntax Syntax,
+ IdentifierInfo *ScopeName) {
+ std::string FullName =
+ AttributeCommonInfo::normalizeFullNameWithSyntax(&II, ScopeName, Syntax);
#define CLANG_ATTR_ARG_CONTEXT_LIST
- return llvm::StringSwitch<bool>(normalizeAttrName(II.getName()))
+ return llvm::StringSwitch<bool>(FullName)
#include "clang/Parse/AttrParserStringSwitches.inc"
- .Default(false);
+ .Default(false);
#undef CLANG_ATTR_ARG_CONTEXT_LIST
}
@@ -523,10 +559,12 @@ unsigned Parser::ParseAttributeArgsCommon(
// Ignore the left paren location for now.
ConsumeParen();
- bool ChangeKWThisToIdent = attributeTreatsKeywordThisAsIdentifier(*AttrName);
- bool AttributeIsTypeArgAttr = attributeIsTypeArgAttr(*AttrName);
+ bool ChangeKWThisToIdent = attributeTreatsKeywordThisAsIdentifier(
+ *AttrName, Form.getSyntax(), ScopeName);
+ bool AttributeIsTypeArgAttr =
+ attributeIsTypeArgAttr(*AttrName, Form.getSyntax(), ScopeName);
bool AttributeHasVariadicIdentifierArg =
- attributeHasVariadicIdentifierArg(*AttrName);
+ attributeHasVariadicIdentifierArg(*AttrName, Form.getSyntax(), ScopeName);
  // Interpret "kw_this" as an identifier if the attribute requests it.
if (ChangeKWThisToIdent && Tok.is(tok::kw_this))
@@ -535,8 +573,9 @@ unsigned Parser::ParseAttributeArgsCommon(
ArgsVector ArgExprs;
if (Tok.is(tok::identifier)) {
// If this attribute wants an 'identifier' argument, make it so.
- bool IsIdentifierArg = AttributeHasVariadicIdentifierArg ||
- attributeHasIdentifierArg(*AttrName);
+ bool IsIdentifierArg =
+ AttributeHasVariadicIdentifierArg ||
+ attributeHasIdentifierArg(*AttrName, Form.getSyntax(), ScopeName);
ParsedAttr::Kind AttrKind =
ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax());
@@ -568,7 +607,8 @@ unsigned Parser::ParseAttributeArgsCommon(
if (T.isUsable())
TheParsedType = T.get();
} else if (AttributeHasVariadicIdentifierArg ||
- attributeHasStrictIdentifierArgs(*AttrName)) {
+ attributeHasStrictIdentifierArgs(*AttrName, Form.getSyntax(),
+ ScopeName)) {
// Parse variadic identifier arg. This can either consume identifiers or
// expressions. Variadic identifier args do not support parameter packs
// because those are typically used for attributes with enumeration
@@ -579,8 +619,9 @@ unsigned Parser::ParseAttributeArgsCommon(
if (ChangeKWThisToIdent && Tok.is(tok::kw_this))
Tok.setKind(tok::identifier);
- if (Tok.is(tok::identifier) && attributeHasStrictIdentifierArgAtIndex(
- *AttrName, ArgExprs.size())) {
+ if (Tok.is(tok::identifier) &&
+ attributeHasStrictIdentifierArgAtIndex(
+ *AttrName, Form.getSyntax(), ScopeName, ArgExprs.size())) {
ArgExprs.push_back(ParseIdentifierLoc());
continue;
}
@@ -589,7 +630,8 @@ unsigned Parser::ParseAttributeArgsCommon(
if (Tok.is(tok::identifier)) {
ArgExprs.push_back(ParseIdentifierLoc());
} else {
- bool Uneval = attributeParsedArgsUnevaluated(*AttrName);
+ bool Uneval = attributeParsedArgsUnevaluated(
+ *AttrName, Form.getSyntax(), ScopeName);
EnterExpressionEvaluationContext Unevaluated(
Actions,
Uneval ? Sema::ExpressionEvaluationContext::Unevaluated
@@ -610,7 +652,8 @@ unsigned Parser::ParseAttributeArgsCommon(
} while (TryConsumeToken(tok::comma));
} else {
// General case. Parse all available expressions.
- bool Uneval = attributeParsedArgsUnevaluated(*AttrName);
+ bool Uneval = attributeParsedArgsUnevaluated(*AttrName, Form.getSyntax(),
+ ScopeName);
EnterExpressionEvaluationContext Unevaluated(
Actions,
Uneval ? Sema::ExpressionEvaluationContext::Unevaluated
@@ -621,7 +664,8 @@ unsigned Parser::ParseAttributeArgsCommon(
ExprVector ParsedExprs;
ParsedAttributeArgumentsProperties ArgProperties =
- attributeStringLiteralListArg(getTargetInfo().getTriple(), *AttrName);
+ attributeStringLiteralListArg(getTargetInfo().getTriple(), *AttrName,
+ Form.getSyntax(), ScopeName);
if (ParseAttributeArgumentList(*AttrName, ParsedExprs, ArgProperties)) {
SkipUntil(tok::r_paren, StopAtSemi);
return 0;
@@ -632,7 +676,7 @@ unsigned Parser::ParseAttributeArgsCommon(
if (!isa<PackExpansionExpr>(ParsedExprs[I]))
continue;
- if (!attributeAcceptsExprPack(*AttrName)) {
+ if (!attributeAcceptsExprPack(*AttrName, Form.getSyntax(), ScopeName)) {
Diag(Tok.getLocation(),
diag::err_attribute_argument_parm_pack_not_supported)
<< AttrName;
@@ -696,7 +740,7 @@ void Parser::ParseGNUAttributeArgs(
ParseTypeTagForDatatypeAttribute(*AttrName, AttrNameLoc, Attrs, EndLoc,
ScopeName, ScopeLoc, Form);
return;
- } else if (attributeIsTypeArgAttr(*AttrName)) {
+ } else if (attributeIsTypeArgAttr(*AttrName, Form.getSyntax(), ScopeName)) {
ParseAttributeWithTypeArg(*AttrName, AttrNameLoc, Attrs, ScopeName,
ScopeLoc, Form);
return;
@@ -6650,48 +6694,66 @@ void Parser::ParseDeclaratorInternal(Declarator &D,
(Tok.is(tok::identifier) &&
(NextToken().is(tok::coloncolon) || NextToken().is(tok::less))) ||
Tok.is(tok::annot_cxxscope))) {
+ TentativeParsingAction TPA(*this, /*Unannotated=*/true);
bool EnteringContext = D.getContext() == DeclaratorContext::File ||
D.getContext() == DeclaratorContext::Member;
CXXScopeSpec SS;
SS.setTemplateParamLists(D.getTemplateParameterLists());
- ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr,
- /*ObjectHasErrors=*/false, EnteringContext);
- if (SS.isNotEmpty()) {
- if (Tok.isNot(tok::star)) {
- // The scope spec really belongs to the direct-declarator.
- if (D.mayHaveIdentifier())
- D.getCXXScopeSpec() = SS;
- else
- AnnotateScopeToken(SS, true);
+ if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr,
+ /*ObjectHasErrors=*/false,
+ /*EnteringContext=*/false,
+ /*MayBePseudoDestructor=*/nullptr,
+ /*IsTypename=*/false, /*LastII=*/nullptr,
+ /*OnlyNamespace=*/false,
+ /*InUsingDeclaration=*/false,
+ /*Disambiguation=*/EnteringContext) ||
+
+ SS.isEmpty() || SS.isInvalid() || !EnteringContext ||
+ Tok.is(tok::star)) {
+ TPA.Commit();
+ if (SS.isNotEmpty() && Tok.is(tok::star)) {
+ if (SS.isValid()) {
+ checkCompoundToken(SS.getEndLoc(), tok::coloncolon,
+ CompoundToken::MemberPtr);
+ }
- if (DirectDeclParser)
- (this->*DirectDeclParser)(D);
+ SourceLocation StarLoc = ConsumeToken();
+ D.SetRangeEnd(StarLoc);
+ DeclSpec DS(AttrFactory);
+ ParseTypeQualifierListOpt(DS);
+ D.ExtendWithDeclSpec(DS);
+
+ // Recurse to parse whatever is left.
+ Actions.runWithSufficientStackSpace(D.getBeginLoc(), [&] {
+ ParseDeclaratorInternal(D, DirectDeclParser);
+ });
+
+ // Sema will have to catch (syntactically invalid) pointers into global
+ // scope. It has to catch pointers into namespace scope anyway.
+ D.AddTypeInfo(DeclaratorChunk::getMemberPointer(
+ SS, DS.getTypeQualifiers(), StarLoc, DS.getEndLoc()),
+ std::move(DS.getAttributes()),
+ /*EndLoc=*/SourceLocation());
return;
}
+ } else {
+ TPA.Revert();
+ SS.clear();
+ ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr,
+ /*ObjectHasErrors=*/false,
+ /*EnteringContext=*/true);
+ }
- if (SS.isValid()) {
- checkCompoundToken(SS.getEndLoc(), tok::coloncolon,
- CompoundToken::MemberPtr);
- }
+ if (SS.isNotEmpty()) {
+ // The scope spec really belongs to the direct-declarator.
+ if (D.mayHaveIdentifier())
+ D.getCXXScopeSpec() = SS;
+ else
+ AnnotateScopeToken(SS, true);
- SourceLocation StarLoc = ConsumeToken();
- D.SetRangeEnd(StarLoc);
- DeclSpec DS(AttrFactory);
- ParseTypeQualifierListOpt(DS);
- D.ExtendWithDeclSpec(DS);
-
- // Recurse to parse whatever is left.
- Actions.runWithSufficientStackSpace(D.getBeginLoc(), [&] {
- ParseDeclaratorInternal(D, DirectDeclParser);
- });
-
- // Sema will have to catch (syntactically invalid) pointers into global
- // scope. It has to catch pointers into namespace scope anyway.
- D.AddTypeInfo(DeclaratorChunk::getMemberPointer(
- SS, DS.getTypeQualifiers(), StarLoc, DS.getEndLoc()),
- std::move(DS.getAttributes()),
- /* Don't replace range end. */ SourceLocation());
+ if (DirectDeclParser)
+ (this->*DirectDeclParser)(D);
return;
}
}
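
The reworked ParseDeclaratorInternal logic above uses a tentative parse to decide whether a leading nested-name-specifier introduces a member-pointer declarator (scope followed by '*') or merely qualifies the declarator-id; only in the latter case is the scope specifier re-parsed with EnteringContext set. The two shapes being distinguished, in plain C++:

struct A { int m; int f(); };

int A::*pm = &A::m;        // 'A::' then '*': a member-pointer declarator
int A::f() { return m; }   // 'A::' qualifies the declarator-id of the definition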
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 1d364f7..c0eae73 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -159,8 +159,8 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType,
bool Parser::ParseOptionalCXXScopeSpecifier(
CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors,
bool EnteringContext, bool *MayBePseudoDestructor, bool IsTypename,
- const IdentifierInfo **LastII, bool OnlyNamespace,
- bool InUsingDeclaration) {
+ const IdentifierInfo **LastII, bool OnlyNamespace, bool InUsingDeclaration,
+ bool Disambiguation) {
assert(getLangOpts().CPlusPlus &&
"Call sites of this function should be guarded by checking for C++");
@@ -528,13 +528,11 @@ bool Parser::ParseOptionalCXXScopeSpecifier(
UnqualifiedId TemplateName;
TemplateName.setIdentifier(&II, Tok.getLocation());
bool MemberOfUnknownSpecialization;
- if (TemplateNameKind TNK = Actions.isTemplateName(getCurScope(), SS,
- /*hasTemplateKeyword=*/false,
- TemplateName,
- ObjectType,
- EnteringContext,
- Template,
- MemberOfUnknownSpecialization)) {
+ if (TemplateNameKind TNK = Actions.isTemplateName(
+ getCurScope(), SS,
+ /*hasTemplateKeyword=*/false, TemplateName, ObjectType,
+ EnteringContext, Template, MemberOfUnknownSpecialization,
+ Disambiguation)) {
// If lookup didn't find anything, we treat the name as a template-name
// anyway. C++20 requires this, and in prior language modes it improves
// error recovery. But before we commit to this, check that we actually
@@ -557,7 +555,8 @@ bool Parser::ParseOptionalCXXScopeSpecifier(
continue;
}
- if (MemberOfUnknownSpecialization && (ObjectType || SS.isSet()) &&
+ if (MemberOfUnknownSpecialization && !Disambiguation &&
+ (ObjectType || SS.isSet()) &&
(IsTypename || isTemplateArgumentList(1) == TPResult::True)) {
// If we had errors before, ObjectType can be dependent even without any
// templates. Do not report missing template keyword in that case.
diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp
index 112cf3d..7389046 100644
--- a/clang/lib/Sema/CheckExprLifetime.cpp
+++ b/clang/lib/Sema/CheckExprLifetime.cpp
@@ -825,7 +825,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path,
if (auto *CCE = dyn_cast<CXXConstructExpr>(Init)) {
if (CCE->getConstructor()->isCopyOrMoveConstructor()) {
if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(CCE->getArg(0))) {
- // assert(false && "hit temporary copy path");
Expr *Arg = MTE->getSubExpr();
Path.push_back({IndirectLocalPathEntry::TemporaryCopy, Arg,
CCE->getConstructor()});
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index be5b7b9..2c49c1f6 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -1044,11 +1044,16 @@ void Sema::ProcessAPINotes(Decl *D) {
if (auto TagContext = dyn_cast<TagDecl>(D->getDeclContext())) {
if (auto CXXMethod = dyn_cast<CXXMethodDecl>(D)) {
- for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
- if (auto Context = UnwindTagContext(TagContext, APINotes)) {
- auto Info =
- Reader->lookupCXXMethod(Context->id, CXXMethod->getName());
- ProcessVersionedAPINotes(*this, CXXMethod, Info);
+ if (!isa<CXXConstructorDecl>(CXXMethod) &&
+ !isa<CXXDestructorDecl>(CXXMethod) &&
+ !isa<CXXConversionDecl>(CXXMethod) &&
+ !CXXMethod->isOverloadedOperator()) {
+ for (auto Reader : APINotes.findAPINotes(D->getLocation())) {
+ if (auto Context = UnwindTagContext(TagContext, APINotes)) {
+ auto Info =
+ Reader->lookupCXXMethod(Context->id, CXXMethod->getName());
+ ProcessVersionedAPINotes(*this, CXXMethod, Info);
+ }
}
}
}
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 7bdecb2..a3f8126 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -5887,7 +5887,7 @@ Sema::GetNameFromUnqualifiedId(const UnqualifiedId &Name) {
static QualType getCoreType(QualType Ty) {
do {
- if (Ty->isPointerType() || Ty->isReferenceType())
+ if (Ty->isPointerOrReferenceType())
Ty = Ty->getPointeeType();
else if (Ty->isArrayType())
Ty = Ty->castAsArrayTypeUnsafe()->getElementType();
@@ -6890,6 +6890,11 @@ static void checkAttributesAfterMerging(Sema &S, NamedDecl &ND) {
}
}
+ if (HybridPatchableAttr *Attr = ND.getAttr<HybridPatchableAttr>()) {
+ if (!ND.isExternallyVisible())
+ S.Diag(Attr->getLocation(),
+ diag::warn_attribute_hybrid_patchable_non_extern);
+ }
if (const InheritableAttr *Attr = getDLLAttr(&ND)) {
auto *VD = dyn_cast<VarDecl>(&ND);
bool IsAnonymousNS = false;
@@ -9334,7 +9339,7 @@ static OpenCLParamType getOpenCLKernelParameterType(Sema &S, QualType PT) {
if (PT->isDependentType())
return InvalidKernelParam;
- if (PT->isPointerType() || PT->isReferenceType()) {
+ if (PT->isPointerOrReferenceType()) {
QualType PointeeType = PT->getPointeeType();
if (PointeeType.getAddressSpace() == LangAS::opencl_generic ||
PointeeType.getAddressSpace() == LangAS::opencl_private ||
@@ -10784,10 +10789,10 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
if (getLangOpts().getOpenCLCompatibleVersion() >= 200) {
if(const PipeType *PipeTy = PT->getAs<PipeType>()) {
QualType ElemTy = PipeTy->getElementType();
- if (ElemTy->isReferenceType() || ElemTy->isPointerType()) {
- Diag(Param->getTypeSpecStartLoc(), diag::err_reference_pipe_type );
- D.setInvalidType();
- }
+ if (ElemTy->isPointerOrReferenceType()) {
+ Diag(Param->getTypeSpecStartLoc(), diag::err_reference_pipe_type);
+ D.setInvalidType();
+ }
}
}
// WebAssembly tables can't be used as function parameters.
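
checkAttributesAfterMerging now warns when hybrid_patchable is applied to a symbol that is not externally visible. Sketch of the warning case (attribute spelling assumed, as noted earlier):

// Internal linkage: the new warn_attribute_hybrid_patchable_non_extern fires
// on the attribute, not on the call.
static __attribute__((hybrid_patchable)) int local_helper(void) { return 0; }
int use_helper(void) { return local_helper(); }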
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 8b0a6bf..9011fa5 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1481,6 +1481,14 @@ static void handleOwnershipAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
break;
}
+  // Allow only pointer return types for functions with the ownership_returns
+  // attribute. This matches the current OwnershipAttr::Takes semantics.
+ if (K == OwnershipAttr::Returns &&
+ !getFunctionOrMethodResultType(D)->isPointerType()) {
+ S.Diag(AL.getLoc(), diag::err_ownership_takes_return_type) << AL;
+ return;
+ }
+
IdentifierInfo *Module = AL.getArgAsIdent(0)->Ident;
StringRef ModuleName = Module->getName();
@@ -5984,7 +5992,7 @@ static void handleMSAllocatorAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
// Warn if the return type is not a pointer or reference type.
if (auto *FD = dyn_cast<FunctionDecl>(D)) {
QualType RetTy = FD->getReturnType();
- if (!RetTy->isPointerType() && !RetTy->isReferenceType()) {
+ if (!RetTy->isPointerOrReferenceType()) {
S.Diag(AL.getLoc(), diag::warn_declspec_allocator_nonpointer)
<< AL.getRange() << RetTy;
return;
@@ -6871,6 +6879,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_MSConstexpr:
handleMSConstexprAttr(S, D, AL);
break;
+ case ParsedAttr::AT_HybridPatchable:
+ handleSimpleAttribute<HybridPatchableAttr>(S, D, AL);
+ break;
// HLSL attributes:
case ParsedAttr::AT_HLSLNumThreads:
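
handleOwnershipAttr now rejects ownership_returns on functions whose return type is not a pointer, matching the existing OwnershipAttr::Takes behaviour as the added comment notes. Sketch using the analyzer's ownership attributes ('mymod' is an arbitrary module name):

void *grab(unsigned n) __attribute__((ownership_returns(mymod)));  // OK: pointer return
int broken(unsigned n) __attribute__((ownership_returns(mymod)));  // now: err_ownership_takes_return_type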
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 029969c..c5003d9 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -711,7 +711,7 @@ getUuidAttrOfType(Sema &SemaRef, QualType QT,
llvm::SmallSetVector<const UuidAttr *, 1> &UuidAttrs) {
// Optionally remove one level of pointer, reference or array indirection.
const Type *Ty = QT.getTypePtr();
- if (QT->isPointerType() || QT->isReferenceType())
+ if (QT->isPointerOrReferenceType())
Ty = QT->getPointeeType().getTypePtr();
else if (QT->isArrayType())
Ty = Ty->getBaseElementTypeUnsafe();
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 9940bc5..11686db 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -650,7 +650,10 @@ class DiagnoseHLSLAvailability
bool HasMatchingEnvironmentOrNone(const AvailabilityAttr *AA);
public:
- DiagnoseHLSLAvailability(Sema &SemaRef) : SemaRef(SemaRef) {}
+ DiagnoseHLSLAvailability(Sema &SemaRef)
+ : SemaRef(SemaRef),
+ CurrentShaderEnvironment(llvm::Triple::UnknownEnvironment),
+ CurrentShaderStageBit(0), ReportOnlyShaderStageIssues(false) {}
// AST traversal methods
void RunOnTranslationUnit(const TranslationUnitDecl *TU);
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index dc2ba03..90fd6df 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -1986,6 +1986,9 @@ static bool checkDestructorReference(QualType ElementType, SourceLocation Loc,
return false;
CXXDestructorDecl *Destructor = SemaRef.LookupDestructor(CXXRD);
+ if (!Destructor)
+ return false;
+
SemaRef.CheckDestructorAccess(Loc, Destructor,
SemaRef.PDiag(diag::err_access_dtor_temp)
<< ElementType);
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 9c80b3e..4f50efd 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -2831,7 +2831,7 @@ static void checkReductionClauses(Sema &S, DSAStackTy *Stack,
S.Diag(InscanLoc, diag::note_omp_previous_inscan_reduction);
continue;
}
- for (Expr *Ref : RC->varlists()) {
+ for (Expr *Ref : RC->varlist()) {
assert(Ref && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
@@ -2871,7 +2871,7 @@ void SemaOpenMP::EndOpenMPDSABlock(Stmt *CurDirective) {
for (OMPClause *C : D->clauses()) {
if (auto *Clause = dyn_cast<OMPLastprivateClause>(C)) {
SmallVector<Expr *, 8> PrivateCopies;
- for (Expr *DE : Clause->varlists()) {
+ for (Expr *DE : Clause->varlist()) {
if (DE->isValueDependent() || DE->isTypeDependent()) {
PrivateCopies.push_back(nullptr);
continue;
@@ -2909,7 +2909,7 @@ void SemaOpenMP::EndOpenMPDSABlock(Stmt *CurDirective) {
// Finalize nontemporal clause by handling private copies, if any.
if (auto *Clause = dyn_cast<OMPNontemporalClause>(C)) {
SmallVector<Expr *, 8> PrivateRefs;
- for (Expr *RefExpr : Clause->varlists()) {
+ for (Expr *RefExpr : Clause->varlist()) {
assert(RefExpr && "NULL expr in OpenMP nontemporal clause.");
SourceLocation ELoc;
SourceRange ERange;
@@ -3752,7 +3752,7 @@ class DSAAttrChecker final : public StmtVisitor<DSAAttrChecker, void> {
!isOpenMPTaskLoopDirective(S->getDirectiveKind())) {
for (OMPClause *C : S->clauses())
if (auto *FC = dyn_cast<OMPFirstprivateClause>(C)) {
- for (Expr *Ref : FC->varlists())
+ for (Expr *Ref : FC->varlist())
Visit(Ref);
}
}
@@ -4729,7 +4729,7 @@ StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S,
SemaRef.MarkDeclarationsReferencedInExpr(E);
}
if (auto *AC = dyn_cast<OMPAlignedClause>(C)) {
- for (Expr *E : AC->varlists())
+ for (Expr *E : AC->varlist())
SemaRef.MarkDeclarationsReferencedInExpr(E);
}
}
@@ -5311,7 +5311,7 @@ static void checkAllocateClauses(Sema &S, DSAStackTy *Stack,
diag::warn_omp_allocate_thread_on_task_target_directive)
<< getOpenMPDirectiveName(Stack->getCurrentDirective());
}
- for (Expr *E : AC->varlists()) {
+ for (Expr *E : AC->varlist()) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = E;
@@ -6135,7 +6135,7 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
SmallVector<Expr *, 4> ImplicitExprs;
for (OMPClause *C : Clauses) {
if (auto *RC = dyn_cast<OMPReductionClause>(C))
- for (Expr *E : RC->varlists())
+ for (Expr *E : RC->varlist())
if (!isa<DeclRefExpr>(E->IgnoreParenImpCasts()))
ImplicitExprs.emplace_back(E);
}
@@ -10445,7 +10445,7 @@ static bool checkGenericLoopLastprivate(Sema &S, ArrayRef<OMPClause *> Clauses,
bool ErrorFound = false;
for (OMPClause *C : Clauses) {
if (auto *LPC = dyn_cast<OMPLastprivateClause>(C)) {
- for (Expr *RefExpr : LPC->varlists()) {
+ for (Expr *RefExpr : LPC->varlist()) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
@@ -19123,7 +19123,7 @@ static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV,
auto CurInit = Clause.inits().begin();
auto CurPrivate = Clause.privates().begin();
OpenMPLinearClauseKind LinKind = Clause.getModifier();
- for (Expr *RefExpr : Clause.varlists()) {
+ for (Expr *RefExpr : Clause.varlist()) {
SourceLocation ELoc;
SourceRange ERange;
Expr *SimpleRefExpr = RefExpr;
@@ -23087,8 +23087,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPDoacrossClause(
if (DSAStack->getCurrentDirective() == OMPD_ordered &&
DepType != OMPC_DOACROSS_source && DepType != OMPC_DOACROSS_sink &&
DepType != OMPC_DOACROSS_sink_omp_cur_iteration &&
- DepType != OMPC_DOACROSS_source_omp_cur_iteration &&
- DepType != OMPC_DOACROSS_source) {
+ DepType != OMPC_DOACROSS_source_omp_cur_iteration) {
Diag(DepLoc, diag::err_omp_unexpected_clause_value)
<< "'source' or 'sink'" << getOpenMPClauseName(OMPC_doacross);
return nullptr;
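The varlists()-to-varlist() changes in this file (and in TreeTransform.h and the serializers below) are a mechanical rename of the clause accessor. A hedged usage sketch, assuming varlist() returns the same range of Expr* that varlists() did:

  // Sketch only: iterate the variables named in an OMPVarListClause-derived clause.
  for (Expr *Ref : Clause->varlist()) {
    if (!Ref)
      continue; // defensive; the call sites above assert non-null entries
    // ... analyze or transform Ref ...
  }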
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 86074a4..c5f56ac 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11133,7 +11133,7 @@ static void DiagnoseBadConversion(Sema &S, OverloadCandidate *Cand,
if (isa<CXXMethodDecl>(Fn) && !isa<CXXConstructorDecl>(Fn)) {
if (I == 0)
isObjectArgument = true;
- else
+ else if (!Fn->hasCXXExplicitFunctionObjectParameter())
I--;
}
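For context, a small example of the case the new condition guards against: with a C++23 explicit object parameter, the object argument already occupies parameter slot 0, so the argument index should not be shifted down when diagnosing the conversion:

  struct S {
    void f(this S self, int x);  // explicit object parameter ("deducing this")
  };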
diff --git a/clang/lib/Sema/SemaPPC.cpp b/clang/lib/Sema/SemaPPC.cpp
index 99f46b1..5b764ed 100644
--- a/clang/lib/Sema/SemaPPC.cpp
+++ b/clang/lib/Sema/SemaPPC.cpp
@@ -93,7 +93,6 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall) {
ASTContext &Context = getASTContext();
- unsigned i = 0, l = 0, u = 0;
bool IsTarget64Bit = TI.getTypeWidth(TI.getIntPtrType()) == 64;
llvm::APSInt Result;
@@ -248,7 +247,7 @@ bool SemaPPC::CheckPPCBuiltinFunctionCall(const TargetInfo &TI,
return BuiltinPPCMMACall(TheCall, BuiltinID, Types);
#include "clang/Basic/BuiltinsPPC.def"
}
- return SemaRef.BuiltinConstantArgRange(TheCall, i, l, u);
+ llvm_unreachable("must return from switch");
}
// Check if the given type is a non-pointer PPC MMA type. This function is used
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 7f452d1..b9b3b40 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -218,7 +218,6 @@ public:
static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
SourceRange Range) {
- NoMergeAttr NMA(S.Context, A);
CallExprFinder CEF(S, St);
if (!CEF.foundCallExpr() && !CEF.foundAsmStmt()) {
@@ -230,6 +229,19 @@ static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
return ::new (S.Context) NoMergeAttr(S.Context, A);
}
+static Attr *handleNoConvergentAttr(Sema &S, Stmt *St, const ParsedAttr &A,
+ SourceRange Range) {
+ CallExprFinder CEF(S, St);
+
+ if (!CEF.foundCallExpr() && !CEF.foundAsmStmt()) {
+ S.Diag(St->getBeginLoc(), diag::warn_attribute_ignored_no_calls_in_stmt)
+ << A;
+ return nullptr;
+ }
+
+ return ::new (S.Context) NoConvergentAttr(S.Context, A);
+}
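A hedged sketch of the new statement attribute at a use site, assuming the [[clang::noconvergent]] spelling implied by ParsedAttr::AT_NoConvergent and, as the handler requires, a call inside the annotated statement:

  void helper();                       // assumed callee
  void body() {
    [[clang::noconvergent]] helper();  // warned about and dropped if the statement has no call or asm
  }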
+
template <typename OtherAttr, int DiagIdx>
static bool CheckStmtInlineAttr(Sema &SemaRef, const Stmt *OrigSt,
const Stmt *CurSt,
@@ -594,13 +606,6 @@ static Attr *handleHLSLLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A,
unsigned UnrollFactor = 0;
if (A.getNumArgs() == 1) {
-
- if (A.isArgIdent(0)) {
- S.Diag(A.getLoc(), diag::err_attribute_argument_type)
- << A << AANT_ArgumentIntegerConstant << A.getRange();
- return nullptr;
- }
-
Expr *E = A.getArgAsExpr(0);
if (S.CheckLoopHintExpr(E, St->getBeginLoc(),
@@ -672,6 +677,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
return handleCodeAlignAttr(S, St, A);
case ParsedAttr::AT_MSConstexpr:
return handleMSConstexprAttr(S, St, A, Range);
+ case ParsedAttr::AT_NoConvergent:
+ return handleNoConvergentAttr(S, St, A, Range);
default:
// N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a
// declaration attribute is not written on a statement, but this code is
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 87b1f98..c22e329 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -6719,7 +6719,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
auto *VD = const_cast<ValueDecl *>(Base.dyn_cast<const ValueDecl *>());
// For a non-type template-parameter of pointer or reference type,
// the value of the constant expression shall not refer to
- assert(ParamType->isPointerType() || ParamType->isReferenceType() ||
+ assert(ParamType->isPointerOrReferenceType() ||
ParamType->isNullPtrType());
// -- a temporary object
// -- a string literal
diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
index 0602d07..545da21 100644
--- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
+++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp
@@ -39,6 +39,7 @@
#include "clang/Sema/Overload.h"
#include "clang/Sema/Ownership.h"
#include "clang/Sema/Scope.h"
+#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/Template.h"
#include "clang/Sema/TemplateDeduction.h"
#include "llvm/ADT/ArrayRef.h"
@@ -241,11 +242,10 @@ NamedDecl *buildDeductionGuide(
}
// Transform a given template type parameter `TTP`.
-TemplateTypeParmDecl *
-transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC,
- TemplateTypeParmDecl *TTP,
- MultiLevelTemplateArgumentList &Args,
- unsigned NewDepth, unsigned NewIndex) {
+TemplateTypeParmDecl *transformTemplateTypeParam(
+ Sema &SemaRef, DeclContext *DC, TemplateTypeParmDecl *TTP,
+ MultiLevelTemplateArgumentList &Args, unsigned NewDepth, unsigned NewIndex,
+ bool EvaluateConstraint) {
// TemplateTypeParmDecl's index cannot be changed after creation, so
// substitute it directly.
auto *NewTTP = TemplateTypeParmDecl::Create(
@@ -257,7 +257,7 @@ transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC,
: std::nullopt);
if (const auto *TC = TTP->getTypeConstraint())
SemaRef.SubstTypeConstraint(NewTTP, TC, Args,
- /*EvaluateConstraint=*/true);
+ /*EvaluateConstraint=*/EvaluateConstraint);
if (TTP->hasDefaultArgument()) {
TemplateArgumentLoc InstantiatedDefaultArg;
if (!SemaRef.SubstTemplateArgument(
@@ -284,6 +284,22 @@ transformTemplateParam(Sema &SemaRef, DeclContext *DC,
return NewParam;
}
+NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC,
+ NamedDecl *TemplateParam,
+ MultiLevelTemplateArgumentList &Args,
+ unsigned NewIndex, unsigned NewDepth,
+ bool EvaluateConstraint = true) {
+ if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
+ return transformTemplateTypeParam(
+ SemaRef, DC, TTP, Args, NewDepth, NewIndex,
+ /*EvaluateConstraint=*/EvaluateConstraint);
+ if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
+ return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth);
+ if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
+ return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth);
+ llvm_unreachable("Unhandled template parameter types");
+}
+
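The refactor below routes all three template parameter kinds through getDepthAndIndex, presumably made visible via the SemaInternal.h include added above. A sketch of the assumed helper shape, returning a (depth, index) pair consumed with structured bindings at the call sites that follow:

  // Assumed signature; the real helper is declared in clang/Sema/SemaInternal.h.
  std::pair<unsigned, unsigned> getDepthAndIndex(const NamedDecl *ND);

  void renumber(NamedDecl *Param) {
    auto [Depth, Index] = getDepthAndIndex(Param);  // as used in the hunks below
    (void)Depth; (void)Index;
  }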
/// Transform to convert portions of a constructor declaration into the
/// corresponding deduction guide, per C++1z [over.match.class.deduct]p1.
struct ConvertConstructorToDeductionGuideTransform {
@@ -358,7 +374,9 @@ struct ConvertConstructorToDeductionGuideTransform {
Args.addOuterRetainedLevel();
if (NestedPattern)
Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth());
- NamedDecl *NewParam = transformTemplateParameter(Param, Args);
+ auto [Depth, Index] = getDepthAndIndex(Param);
+ NamedDecl *NewParam = transformTemplateParameter(
+ SemaRef, DC, Param, Args, Index + Depth1IndexAdjustment, Depth - 1);
if (!NewParam)
return nullptr;
// Constraints require that we substitute depth-1 arguments
@@ -366,12 +384,11 @@ struct ConvertConstructorToDeductionGuideTransform {
Depth1Args.push_back(SemaRef.Context.getInjectedTemplateArg(NewParam));
if (NestedPattern) {
- TemplateDeclInstantiator Instantiator(SemaRef, DC,
- OuterInstantiationArgs);
- Instantiator.setEvaluateConstraints(false);
- SemaRef.runWithSufficientStackSpace(NewParam->getLocation(), [&] {
- NewParam = cast<NamedDecl>(Instantiator.Visit(NewParam));
- });
+ auto [Depth, Index] = getDepthAndIndex(NewParam);
+ NewParam = transformTemplateParameter(
+ SemaRef, DC, NewParam, OuterInstantiationArgs, Index,
+ Depth - OuterInstantiationArgs.getNumSubstitutedLevels(),
+ /*EvaluateConstraint=*/false);
}
assert(NewParam->getTemplateDepth() == 0 &&
@@ -479,25 +496,6 @@ struct ConvertConstructorToDeductionGuideTransform {
}
private:
- /// Transform a constructor template parameter into a deduction guide template
- /// parameter, rebuilding any internal references to earlier parameters and
- /// renumbering as we go.
- NamedDecl *transformTemplateParameter(NamedDecl *TemplateParam,
- MultiLevelTemplateArgumentList &Args) {
- if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
- return transformTemplateTypeParam(
- SemaRef, DC, TTP, Args, TTP->getDepth() - 1,
- Depth1IndexAdjustment + TTP->getIndex());
- if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
- return transformTemplateParam(SemaRef, DC, TTP, Args,
- Depth1IndexAdjustment + TTP->getIndex(),
- TTP->getDepth() - 1);
- auto *NTTP = cast<NonTypeTemplateParmDecl>(TemplateParam);
- return transformTemplateParam(SemaRef, DC, NTTP, Args,
- Depth1IndexAdjustment + NTTP->getIndex(),
- NTTP->getDepth() - 1);
- }
-
QualType transformFunctionProtoType(
TypeLocBuilder &TLB, FunctionProtoTypeLoc TL,
SmallVectorImpl<ParmVarDecl *> &Params,
@@ -634,26 +632,6 @@ private:
}
};
-unsigned getTemplateParameterDepth(NamedDecl *TemplateParam) {
- if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
- return TTP->getDepth();
- if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
- return TTP->getDepth();
- if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
- return NTTP->getDepth();
- llvm_unreachable("Unhandled template parameter types");
-}
-
-unsigned getTemplateParameterIndex(NamedDecl *TemplateParam) {
- if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
- return TTP->getIndex();
- if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
- return TTP->getIndex();
- if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
- return NTTP->getIndex();
- llvm_unreachable("Unhandled template parameter types");
-}
-
// Find all template parameters that appear in the given DeducedArgs.
// Return the indices of the template parameters in the TemplateParams.
SmallVector<unsigned> TemplateParamsReferencedInTemplateArgumentList(
@@ -689,8 +667,10 @@ SmallVector<unsigned> TemplateParamsReferencedInTemplateArgumentList(
void MarkAppeared(NamedDecl *ND) {
if (llvm::isa<NonTypeTemplateParmDecl, TemplateTypeParmDecl,
- TemplateTemplateParmDecl>(ND))
- Mark(getTemplateParameterDepth(ND), getTemplateParameterIndex(ND));
+ TemplateTemplateParmDecl>(ND)) {
+ auto [Depth, Index] = getDepthAndIndex(ND);
+ Mark(Depth, Index);
+ }
}
void Mark(unsigned Depth, unsigned Index) {
if (Index < TemplateParamList->size() &&
@@ -722,20 +702,6 @@ bool hasDeclaredDeductionGuides(DeclarationName Name, DeclContext *DC) {
return false;
}
-NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC,
- NamedDecl *TemplateParam,
- MultiLevelTemplateArgumentList &Args,
- unsigned NewIndex, unsigned NewDepth) {
- if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(TemplateParam))
- return transformTemplateTypeParam(SemaRef, DC, TTP, Args, NewDepth,
- NewIndex);
- if (auto *TTP = dyn_cast<TemplateTemplateParmDecl>(TemplateParam))
- return transformTemplateParam(SemaRef, DC, TTP, Args, NewIndex, NewDepth);
- if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TemplateParam))
- return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth);
- llvm_unreachable("Unhandled template parameter types");
-}
-
// Build the associated constraints for the alias deduction guides.
// C++ [over.match.class.deduct]p3.3:
// The associated constraints ([temp.constr.decl]) are the conjunction of the
@@ -791,7 +757,7 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
NamedDecl *NewParam = transformTemplateParameter(
SemaRef, AliasTemplate->getDeclContext(), TP, Args,
/*NewIndex=*/AdjustedAliasTemplateArgs.size(),
- getTemplateParameterDepth(TP) + AdjustDepth);
+ getDepthAndIndex(TP).first + AdjustDepth);
TemplateArgument NewTemplateArgument =
Context.getInjectedTemplateArg(NewParam);
@@ -814,10 +780,10 @@ buildAssociatedConstraints(Sema &SemaRef, FunctionTemplateDecl *F,
Args.setKind(TemplateSubstitutionKind::Rewrite);
Args.addOuterTemplateArguments(TemplateArgsForBuildingRC);
// Rebuild the template parameter with updated depth and index.
- NamedDecl *NewParam = transformTemplateParameter(
- SemaRef, F->getDeclContext(), TP, Args,
- /*NewIndex=*/FirstUndeducedParamIdx,
- getTemplateParameterDepth(TP) + AdjustDepth);
+ NamedDecl *NewParam =
+ transformTemplateParameter(SemaRef, F->getDeclContext(), TP, Args,
+ /*NewIndex=*/FirstUndeducedParamIdx,
+ getDepthAndIndex(TP).first + AdjustDepth);
FirstUndeducedParamIdx += 1;
assert(TemplateArgsForBuildingRC[Index].isNull());
TemplateArgsForBuildingRC[Index] =
@@ -919,7 +885,7 @@ Expr *buildIsDeducibleConstraint(Sema &SemaRef,
NamedDecl *NewParam = transformTemplateParameter(
SemaRef, AliasTemplate->getDeclContext(), TP, Args,
/*NewIndex=*/TransformedTemplateArgs.size(),
- getTemplateParameterDepth(TP) + AdjustDepth);
+ getDepthAndIndex(TP).first + AdjustDepth);
TemplateArgument NewTemplateArgument =
Context.getInjectedTemplateArg(NewParam);
@@ -1081,8 +1047,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
Args.addOuterTemplateArguments(TransformedDeducedAliasArgs);
NamedDecl *NewParam = transformTemplateParameter(
SemaRef, AliasTemplate->getDeclContext(), TP, Args,
- /*NewIndex=*/FPrimeTemplateParams.size(),
- getTemplateParameterDepth(TP));
+ /*NewIndex=*/FPrimeTemplateParams.size(), getDepthAndIndex(TP).first);
FPrimeTemplateParams.push_back(NewParam);
TemplateArgument NewTemplateArgument =
@@ -1101,7 +1066,7 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef,
Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime);
NamedDecl *NewParam = transformTemplateParameter(
SemaRef, F->getDeclContext(), TP, Args, FPrimeTemplateParams.size(),
- getTemplateParameterDepth(TP));
+ getDepthAndIndex(TP).first);
FPrimeTemplateParams.push_back(NewParam);
assert(TemplateArgsForBuildingFPrime[FTemplateParamIdx].isNull() &&
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index a12d2ef..f93cd11 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -330,7 +330,7 @@ instantiateDependentModeAttr(Sema &S,
static void instantiateOMPDeclareSimdDeclAttr(
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
const OMPDeclareSimdDeclAttr &Attr, Decl *New) {
- // Allow 'this' in clauses with varlists.
+ // Allow 'this' in clauses with varlist.
if (auto *FTD = dyn_cast<FunctionTemplateDecl>(New))
New = FTD->getTemplatedDecl();
auto *FD = cast<FunctionDecl>(New);
@@ -413,7 +413,7 @@ static void instantiateOMPDeclareSimdDeclAttr(
static void instantiateOMPDeclareVariantAttr(
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
const OMPDeclareVariantAttr &Attr, Decl *New) {
- // Allow 'this' in clauses with varlists.
+ // Allow 'this' in clauses with varlist.
if (auto *FTD = dyn_cast<FunctionTemplateDecl>(New))
New = FTD->getTemplatedDecl();
auto *FD = cast<FunctionDecl>(New);
@@ -3588,7 +3588,7 @@ Decl *TemplateDeclInstantiator::VisitUsingPackDecl(UsingPackDecl *D) {
Decl *TemplateDeclInstantiator::VisitOMPThreadPrivateDecl(
OMPThreadPrivateDecl *D) {
SmallVector<Expr *, 5> Vars;
- for (auto *I : D->varlists()) {
+ for (auto *I : D->varlist()) {
Expr *Var = SemaRef.SubstExpr(I, TemplateArgs).get();
assert(isa<DeclRefExpr>(Var) && "threadprivate arg is not a DeclRefExpr");
Vars.push_back(Var);
@@ -3605,7 +3605,7 @@ Decl *TemplateDeclInstantiator::VisitOMPThreadPrivateDecl(
Decl *TemplateDeclInstantiator::VisitOMPAllocateDecl(OMPAllocateDecl *D) {
SmallVector<Expr *, 5> Vars;
- for (auto *I : D->varlists()) {
+ for (auto *I : D->varlist()) {
Expr *Var = SemaRef.SubstExpr(I, TemplateArgs).get();
assert(isa<DeclRefExpr>(Var) && "allocate arg is not a DeclRefExpr");
Vars.push_back(Var);
@@ -3782,7 +3782,7 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) {
for (OMPClause *C : D->clauselists()) {
auto *OldC = cast<OMPMapClause>(C);
SmallVector<Expr *, 4> NewVars;
- for (Expr *OE : OldC->varlists()) {
+ for (Expr *OE : OldC->varlist()) {
Expr *NE = SemaRef.SubstExpr(OE, TemplateArgs).get();
if (!NE) {
IsCorrect = false;
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 4d68ebf..8d3e1ed 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -10309,7 +10309,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPInitClause(OMPInitClause *C) {
OMPInteropInfo InteropInfo(C->getIsTarget(), C->getIsTargetSync());
InteropInfo.PreferTypes.reserve(C->varlist_size() - 1);
- for (Expr *E : llvm::drop_begin(C->varlists())) {
+ for (Expr *E : llvm::drop_begin(C->varlist())) {
ExprResult ER = getDerived().TransformExpr(cast<Expr>(E));
if (ER.isInvalid())
return nullptr;
@@ -10447,7 +10447,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPPrivateClause(OMPPrivateClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10462,7 +10462,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPFirstprivateClause(
OMPFirstprivateClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10477,7 +10477,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPLastprivateClause(OMPLastprivateClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10493,7 +10493,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPSharedClause(OMPSharedClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10508,7 +10508,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPReductionClause(OMPReductionClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10555,7 +10555,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPTaskReductionClause(
OMPTaskReductionClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10601,7 +10601,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPInReductionClause(OMPInReductionClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10647,7 +10647,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPLinearClause(OMPLinearClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10667,7 +10667,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPAlignedClause(OMPAlignedClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10686,7 +10686,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPCopyinClause(OMPCopyinClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10701,7 +10701,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPCopyprivateClause(OMPCopyprivateClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10715,7 +10715,7 @@ template <typename Derived>
OMPClause *TreeTransform<Derived>::TransformOMPFlushClause(OMPFlushClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10747,7 +10747,7 @@ TreeTransform<Derived>::TransformOMPDependClause(OMPDependClause *C) {
DepModifier = DepModRes.get();
}
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10778,7 +10778,7 @@ bool transformOMPMappableExprListClause(
llvm::SmallVectorImpl<Expr *> &UnresolvedMappers) {
// Transform expressions in the list.
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = TT.getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return true;
@@ -10858,7 +10858,7 @@ TreeTransform<Derived>::TransformOMPAllocateClause(OMPAllocateClause *C) {
}
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -10990,7 +10990,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPUseDevicePtrClause(
OMPUseDevicePtrClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -11005,7 +11005,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPUseDeviceAddrClause(
OMPUseDeviceAddrClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -11020,7 +11020,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -11035,7 +11035,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPHasDeviceAddrClause(
OMPHasDeviceAddrClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -11050,7 +11050,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPNontemporalClause(OMPNontemporalClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -11065,7 +11065,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPInclusiveClause(OMPInclusiveClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -11080,7 +11080,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPExclusiveClause(OMPExclusiveClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
@@ -11127,7 +11127,7 @@ TreeTransform<Derived>::TransformOMPAffinityClause(OMPAffinityClause *C) {
if (ModifierRes.isInvalid())
return nullptr;
}
- for (Expr *E : C->varlists()) {
+ for (Expr *E : C->varlist()) {
ExprResult Locator = getDerived().TransformExpr(E);
if (Locator.isInvalid())
continue;
@@ -11167,7 +11167,7 @@ OMPClause *
TreeTransform<Derived>::TransformOMPDoacrossClause(OMPDoacrossClause *C) {
llvm::SmallVector<Expr *, 16> Vars;
Vars.reserve(C->varlist_size());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
ExprResult EVar = getDerived().TransformExpr(cast<Expr>(VE));
if (EVar.isInvalid())
return nullptr;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 3cb96df..86fa96a 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -3023,8 +3023,9 @@ ASTReader::ReadControlBlock(ModuleFile &F,
case METADATA: {
if (Record[0] != VERSION_MAJOR && !DisableValidation) {
if ((ClientLoadCapabilities & ARR_VersionMismatch) == 0)
- Diag(Record[0] < VERSION_MAJOR? diag::err_pch_version_too_old
- : diag::err_pch_version_too_new);
+ Diag(Record[0] < VERSION_MAJOR ? diag::err_ast_file_version_too_old
+ : diag::err_ast_file_version_too_new)
+ << moduleKindForDiagnostic(F.Kind) << F.FileName;
return VersionMismatch;
}
@@ -3037,7 +3038,8 @@ ASTReader::ReadControlBlock(ModuleFile &F,
return OutOfDate;
if (!AllowASTWithCompilerErrors) {
- Diag(diag::err_pch_with_compiler_errors);
+ Diag(diag::err_ast_file_with_compiler_errors)
+ << moduleKindForDiagnostic(F.Kind) << F.FileName;
return HadErrors;
}
}
@@ -3060,7 +3062,9 @@ ASTReader::ReadControlBlock(ModuleFile &F,
StringRef ASTBranch = Blob;
if (StringRef(CurBranch) != ASTBranch && !DisableValidation) {
if ((ClientLoadCapabilities & ARR_VersionMismatch) == 0)
- Diag(diag::err_pch_different_branch) << ASTBranch << CurBranch;
+ Diag(diag::err_ast_file_different_branch)
+ << moduleKindForDiagnostic(F.Kind) << F.FileName << ASTBranch
+ << CurBranch;
return VersionMismatch;
}
break;
@@ -4827,7 +4831,8 @@ ASTReader::ReadASTCore(StringRef FileName,
case AST_BLOCK_ID:
if (!HaveReadControlBlock) {
if ((ClientLoadCapabilities & ARR_VersionMismatch) == 0)
- Diag(diag::err_pch_version_too_old);
+ Diag(diag::err_ast_file_version_too_old)
+ << moduleKindForDiagnostic(Type) << FileName;
return VersionMismatch;
}
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index c78d894..f0f9d39 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7220,7 +7220,7 @@ void OMPClauseWriter::VisitOMPNogroupClause(OMPNogroupClause *) {}
void OMPClauseWriter::VisitOMPInitClause(OMPInitClause *C) {
Record.push_back(C->varlist_size());
- for (Expr *VE : C->varlists())
+ for (Expr *VE : C->varlist())
Record.AddStmt(VE);
Record.writeBool(C->getIsTarget());
Record.writeBool(C->getIsTargetSync());
@@ -7266,7 +7266,7 @@ void OMPClauseWriter::VisitOMPAlignClause(OMPAlignClause *C) {
void OMPClauseWriter::VisitOMPPrivateClause(OMPPrivateClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
Record.AddStmt(VE);
}
for (auto *VE : C->private_copies()) {
@@ -7278,7 +7278,7 @@ void OMPClauseWriter::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
Record.push_back(C->varlist_size());
VisitOMPClauseWithPreInit(C);
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
Record.AddStmt(VE);
}
for (auto *VE : C->private_copies()) {
@@ -7296,7 +7296,7 @@ void OMPClauseWriter::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
Record.writeEnum(C->getKind());
Record.AddSourceLocation(C->getKindLoc());
Record.AddSourceLocation(C->getColonLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (auto *E : C->private_copies())
Record.AddStmt(E);
@@ -7311,7 +7311,7 @@ void OMPClauseWriter::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
void OMPClauseWriter::VisitOMPSharedClause(OMPSharedClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
}
@@ -7324,7 +7324,7 @@ void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) {
Record.AddSourceLocation(C->getColonLoc());
Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
Record.AddDeclarationNameInfo(C->getNameInfo());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (auto *VE : C->privates())
Record.AddStmt(VE);
@@ -7351,7 +7351,7 @@ void OMPClauseWriter::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) {
Record.AddSourceLocation(C->getColonLoc());
Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
Record.AddDeclarationNameInfo(C->getNameInfo());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (auto *VE : C->privates())
Record.AddStmt(VE);
@@ -7370,7 +7370,7 @@ void OMPClauseWriter::VisitOMPInReductionClause(OMPInReductionClause *C) {
Record.AddSourceLocation(C->getColonLoc());
Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
Record.AddDeclarationNameInfo(C->getNameInfo());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (auto *VE : C->privates())
Record.AddStmt(VE);
@@ -7391,7 +7391,7 @@ void OMPClauseWriter::VisitOMPLinearClause(OMPLinearClause *C) {
Record.AddSourceLocation(C->getColonLoc());
Record.push_back(C->getModifier());
Record.AddSourceLocation(C->getModifierLoc());
- for (auto *VE : C->varlists()) {
+ for (auto *VE : C->varlist()) {
Record.AddStmt(VE);
}
for (auto *VE : C->privates()) {
@@ -7416,7 +7416,7 @@ void OMPClauseWriter::VisitOMPAlignedClause(OMPAlignedClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
Record.AddSourceLocation(C->getColonLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
Record.AddStmt(C->getAlignment());
}
@@ -7424,7 +7424,7 @@ void OMPClauseWriter::VisitOMPAlignedClause(OMPAlignedClause *C) {
void OMPClauseWriter::VisitOMPCopyinClause(OMPCopyinClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (auto *E : C->source_exprs())
Record.AddStmt(E);
@@ -7437,7 +7437,7 @@ void OMPClauseWriter::VisitOMPCopyinClause(OMPCopyinClause *C) {
void OMPClauseWriter::VisitOMPCopyprivateClause(OMPCopyprivateClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (auto *E : C->source_exprs())
Record.AddStmt(E);
@@ -7450,7 +7450,7 @@ void OMPClauseWriter::VisitOMPCopyprivateClause(OMPCopyprivateClause *C) {
void OMPClauseWriter::VisitOMPFlushClause(OMPFlushClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
}
@@ -7468,7 +7468,7 @@ void OMPClauseWriter::VisitOMPDependClause(OMPDependClause *C) {
Record.AddSourceLocation(C->getDependencyLoc());
Record.AddSourceLocation(C->getColonLoc());
Record.AddSourceLocation(C->getOmpAllMemoryLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (unsigned I = 0, E = C->getNumLoops(); I < E; ++I)
Record.AddStmt(C->getLoopData(I));
@@ -7500,7 +7500,7 @@ void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) {
Record.push_back(C->getMapType());
Record.AddSourceLocation(C->getMapLoc());
Record.AddSourceLocation(C->getColonLoc());
- for (auto *E : C->varlists())
+ for (auto *E : C->varlist())
Record.AddStmt(E);
for (auto *E : C->mapperlists())
Record.AddStmt(E);
@@ -7523,7 +7523,7 @@ void OMPClauseWriter::VisitOMPAllocateClause(OMPAllocateClause *C) {
Record.AddSourceLocation(C->getLParenLoc());
Record.AddSourceLocation(C->getColonLoc());
Record.AddStmt(C->getAllocator());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
}
@@ -7596,7 +7596,7 @@ void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) {
Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc());
Record.AddDeclarationNameInfo(C->getMapperIdInfo());
Record.AddSourceLocation(C->getColonLoc());
- for (auto *E : C->varlists())
+ for (auto *E : C->varlist())
Record.AddStmt(E);
for (auto *E : C->mapperlists())
Record.AddStmt(E);
@@ -7626,7 +7626,7 @@ void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) {
Record.AddNestedNameSpecifierLoc(C->getMapperQualifierLoc());
Record.AddDeclarationNameInfo(C->getMapperIdInfo());
Record.AddSourceLocation(C->getColonLoc());
- for (auto *E : C->varlists())
+ for (auto *E : C->varlist())
Record.AddStmt(E);
for (auto *E : C->mapperlists())
Record.AddStmt(E);
@@ -7649,7 +7649,7 @@ void OMPClauseWriter::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
Record.push_back(C->getTotalComponentListNum());
Record.push_back(C->getTotalComponentsNum());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *E : C->varlists())
+ for (auto *E : C->varlist())
Record.AddStmt(E);
for (auto *VE : C->private_copies())
Record.AddStmt(VE);
@@ -7673,7 +7673,7 @@ void OMPClauseWriter::VisitOMPUseDeviceAddrClause(OMPUseDeviceAddrClause *C) {
Record.push_back(C->getTotalComponentListNum());
Record.push_back(C->getTotalComponentsNum());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *E : C->varlists())
+ for (auto *E : C->varlist())
Record.AddStmt(E);
for (auto *D : C->all_decls())
Record.AddDeclRef(D);
@@ -7693,7 +7693,7 @@ void OMPClauseWriter::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
Record.push_back(C->getTotalComponentListNum());
Record.push_back(C->getTotalComponentsNum());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *E : C->varlists())
+ for (auto *E : C->varlist())
Record.AddStmt(E);
for (auto *D : C->all_decls())
Record.AddDeclRef(D);
@@ -7713,7 +7713,7 @@ void OMPClauseWriter::VisitOMPHasDeviceAddrClause(OMPHasDeviceAddrClause *C) {
Record.push_back(C->getTotalComponentListNum());
Record.push_back(C->getTotalComponentsNum());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *E : C->varlists())
+ for (auto *E : C->varlist())
Record.AddStmt(E);
for (auto *D : C->all_decls())
Record.AddDeclRef(D);
@@ -7765,7 +7765,7 @@ void OMPClauseWriter::VisitOMPMessageClause(OMPMessageClause *C) {
void OMPClauseWriter::VisitOMPNontemporalClause(OMPNontemporalClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (auto *E : C->private_refs())
Record.AddStmt(E);
@@ -7774,14 +7774,14 @@ void OMPClauseWriter::VisitOMPNontemporalClause(OMPNontemporalClause *C) {
void OMPClauseWriter::VisitOMPInclusiveClause(OMPInclusiveClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
}
void OMPClauseWriter::VisitOMPExclusiveClause(OMPExclusiveClause *C) {
Record.push_back(C->varlist_size());
Record.AddSourceLocation(C->getLParenLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
}
@@ -7810,7 +7810,7 @@ void OMPClauseWriter::VisitOMPAffinityClause(OMPAffinityClause *C) {
Record.AddSourceLocation(C->getLParenLoc());
Record.AddStmt(C->getModifier());
Record.AddSourceLocation(C->getColonLoc());
- for (Expr *E : C->varlists())
+ for (Expr *E : C->varlist())
Record.AddStmt(E);
}
@@ -7833,7 +7833,7 @@ void OMPClauseWriter::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
Record.push_back(C->getDependenceType());
Record.AddSourceLocation(C->getDependenceLoc());
Record.AddSourceLocation(C->getColonLoc());
- for (auto *VE : C->varlists())
+ for (auto *VE : C->varlist())
Record.AddStmt(VE);
for (unsigned I = 0, E = C->getNumLoops(); I < E; ++I)
Record.AddStmt(C->getLoopData(I));
diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 5377053..4454f30 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -615,30 +615,22 @@ private:
});
}
- void initMacroValues(CheckerContext &C) const {
+ void initMacroValues(const Preprocessor &PP) const {
if (EofVal)
return;
- if (const std::optional<int> OptInt =
- tryExpandAsInteger("EOF", C.getPreprocessor()))
+ if (const std::optional<int> OptInt = tryExpandAsInteger("EOF", PP))
EofVal = *OptInt;
else
EofVal = -1;
- if (const std::optional<int> OptInt =
- tryExpandAsInteger("SEEK_SET", C.getPreprocessor()))
+ if (const std::optional<int> OptInt = tryExpandAsInteger("SEEK_SET", PP))
SeekSetVal = *OptInt;
- if (const std::optional<int> OptInt =
- tryExpandAsInteger("SEEK_END", C.getPreprocessor()))
+ if (const std::optional<int> OptInt = tryExpandAsInteger("SEEK_END", PP))
SeekEndVal = *OptInt;
- if (const std::optional<int> OptInt =
- tryExpandAsInteger("SEEK_CUR", C.getPreprocessor()))
+ if (const std::optional<int> OptInt = tryExpandAsInteger("SEEK_CUR", PP))
SeekCurVal = *OptInt;
}
- void initVaListType(CheckerContext &C) const {
- VaListType = C.getASTContext().getBuiltinVaListType().getCanonicalType();
- }
-
/// Searches for the ExplodedNode where the file descriptor was acquired for
/// StreamSym.
static const ExplodedNode *getAcquisitionSite(const ExplodedNode *N,
@@ -880,9 +872,6 @@ static ProgramStateRef escapeArgs(ProgramStateRef State, CheckerContext &C,
void StreamChecker::checkPreCall(const CallEvent &Call,
CheckerContext &C) const {
- initMacroValues(C);
- initVaListType(C);
-
const FnDescription *Desc = lookupFn(Call);
if (!Desc || !Desc->PreFn)
return;
@@ -938,7 +927,6 @@ void StreamChecker::evalFopen(const FnDescription *Desc, const CallEvent &Call,
assert(RetSym && "RetVal must be a symbol here.");
State = State->BindExpr(CE, C.getLocationContext(), RetVal);
- State = assumeNoAliasingWithStdStreams(State, RetVal, C);
// Bifurcate the state into two: one with a valid FILE* pointer, the other
// with a NULL.
@@ -951,6 +939,8 @@ void StreamChecker::evalFopen(const FnDescription *Desc, const CallEvent &Call,
StateNull =
StateNull->set<StreamMap>(RetSym, StreamState::getOpenFailed(Desc));
+ StateNotNull = assumeNoAliasingWithStdStreams(StateNotNull, RetVal, C);
+
C.addTransition(StateNotNull,
constructLeakNoteTag(C, RetSym, "Stream opened here"));
C.addTransition(StateNull);
@@ -2081,10 +2071,12 @@ getGlobalStreamPointerByName(const TranslationUnitDecl *TU, StringRef VarName) {
}
void StreamChecker::checkASTDecl(const TranslationUnitDecl *TU,
- AnalysisManager &, BugReporter &) const {
+ AnalysisManager &Mgr, BugReporter &) const {
StdinDecl = getGlobalStreamPointerByName(TU, "stdin");
StdoutDecl = getGlobalStreamPointerByName(TU, "stdout");
StderrDecl = getGlobalStreamPointerByName(TU, "stderr");
+ VaListType = TU->getASTContext().getBuiltinVaListType().getCanonicalType();
+ initMacroValues(Mgr.getPreprocessor());
}
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/Tooling/ArgumentsAdjusters.cpp b/clang/lib/Tooling/ArgumentsAdjusters.cpp
index df4c742..d01c57e 100644
--- a/clang/lib/Tooling/ArgumentsAdjusters.cpp
+++ b/clang/lib/Tooling/ArgumentsAdjusters.cpp
@@ -49,10 +49,11 @@ ArgumentsAdjuster getClangSyntaxOnlyAdjuster() {
}))
continue;
- if (!Arg.starts_with("-fcolor-diagnostics") &&
+ if (Arg != "-c" && Arg != "-S" &&
+ !Arg.starts_with("-fcolor-diagnostics") &&
!Arg.starts_with("-fdiagnostics-color"))
AdjustedArgs.push_back(Args[i]);
- // If we strip a color option, make sure we strip any preceeding `-Xclang`
+    // If we strip an option, make sure we strip any preceding `-Xclang`
// option as well.
// FIXME: This should be added to most argument adjusters!
else if (!AdjustedArgs.empty() && AdjustedArgs.back() == "-Xclang")
diff --git a/clang/test/APINotes/Inputs/Headers/Methods.h b/clang/test/APINotes/Inputs/Headers/Methods.h
index 6a96b12..cbb57cc 100644
--- a/clang/test/APINotes/Inputs/Headers/Methods.h
+++ b/clang/test/APINotes/Inputs/Headers/Methods.h
@@ -2,6 +2,8 @@ struct IntWrapper {
int value;
IntWrapper getIncremented() const { return {value + 1}; }
+
+ IntWrapper operator+(const IntWrapper& RHS) const { return {value + RHS.value}; }
};
struct Outer {
@@ -9,5 +11,9 @@ struct Outer {
int value;
Inner getDecremented() const { return {value - 1}; }
+
+ bool operator==(const Inner& RHS) const {
+ return value == RHS.value;
+ }
};
};
diff --git a/clang/test/AST/Interp/constexpr-subobj-initialization.cpp b/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
index 4976b16..1a35994 100644
--- a/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
+++ b/clang/test/AST/Interp/constexpr-subobj-initialization.cpp
@@ -5,8 +5,6 @@
/// Differences:
/// 1) The type of the uninitialized base class is printed WITH the namespace,
/// i.e. 'baseclass_uninit::DelBase' instead of just 'DelBase'.
-/// 2) The location is not the base specifier declaration, but the call site
-/// of the constructor.
namespace baseclass_uninit {
@@ -14,33 +12,29 @@ struct DelBase {
constexpr DelBase() = delete; // expected-note {{'DelBase' has been explicitly marked deleted here}}
};
-struct Foo : DelBase {
+struct Foo : DelBase { // expected-note 2{{constructor of base class 'baseclass_uninit::DelBase' is not called}}
constexpr Foo() {}; // expected-error {{call to deleted constructor of 'DelBase'}}
};
-constexpr Foo f; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{constructor of base class 'baseclass_uninit::DelBase' is not called}}
+constexpr Foo f; // expected-error {{must be initialized by a constant expression}}
struct Bar : Foo {
constexpr Bar() {};
};
-constexpr Bar bar; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{constructor of base class 'baseclass_uninit::DelBase' is not called}}
+constexpr Bar bar; // expected-error {{must be initialized by a constant expression}}
struct Base {};
-struct A : Base {
+struct A : Base { // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
constexpr A() : value() {} // expected-error {{member initializer 'value' does not name a non-static data member or base class}}
};
-constexpr A a; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
+constexpr A a; // expected-error {{must be initialized by a constant expression}}
-struct B : Base {
+struct B : Base { // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
constexpr B() : {} // expected-error {{expected class member or base class name}}
};
-constexpr B b; // expected-error {{must be initialized by a constant expression}} \
- // expected-note {{constructor of base class 'baseclass_uninit::Base' is not called}}
+constexpr B b; // expected-error {{must be initialized by a constant expression}}
} // namespace baseclass_uninit
diff --git a/clang/test/AST/Interp/cxx2a.cpp b/clang/test/AST/Interp/cxx2a.cpp
index 27d1aa1..ad021b3 100644
--- a/clang/test/AST/Interp/cxx2a.cpp
+++ b/clang/test/AST/Interp/cxx2a.cpp
@@ -13,3 +13,24 @@ consteval int aConstevalFunction() { // both-error {{consteval function never pr
return 0;
}
/// We're NOT calling the above function. The diagnostics should appear anyway.
+
+namespace Covariant {
+ struct A {
+ virtual constexpr char f() const { return 'Z'; }
+ char a = f();
+ };
+
+ struct D : A {};
+ struct Covariant1 {
+ D d;
+ virtual const A *f() const;
+ };
+
+ struct Covariant3 : Covariant1 {
+ constexpr virtual const D *f() const { return &this->d; }
+ };
+
+ constexpr Covariant3 cb;
+ constexpr const Covariant1 *cb1 = &cb;
+ static_assert(cb1->f()->a == 'Z');
+}
diff --git a/clang/test/AST/Interp/lifetimes.cpp b/clang/test/AST/Interp/lifetimes.cpp
index d47533a..9fca54f 100644
--- a/clang/test/AST/Interp/lifetimes.cpp
+++ b/clang/test/AST/Interp/lifetimes.cpp
@@ -33,3 +33,30 @@ struct S {
constexpr int k1 = S().t; // both-error {{must be initialized by a constant expression}} \
// ref-note {{in call to}} \
// expected-note {{in call to}}
+
+
+namespace MoveFnWorks {
+ template<typename T> constexpr T &&ref(T &&t) { return (T&&)t; }
+
+ struct Buf {};
+
+ struct A {
+ constexpr A(Buf &buf) : buf(buf) { }
+ Buf &buf;
+ };
+
+ constexpr bool dtor_calls_dtor() {
+ struct B {
+ A &&d;
+ constexpr B(Buf &buf) : d(ref(A(buf))) {}
+ };
+
+ Buf buf;
+ {
+ B b(buf);
+ }
+
+ return true;
+ }
+ static_assert(dtor_calls_dtor(), "");
+}
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 9551630..479c048 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1560,3 +1560,19 @@ namespace ArrayInitChain {
static_assert(A[1].Width == 12, "");
static_assert(A[1].Mask == 13, "");
}
+
+#if __cplusplus >= 202002L
+namespace ctorOverrider {
+ // Ensure that we pick the right final overrider during construction.
+ struct A {
+ virtual constexpr char f() const { return 'A'; }
+ char a = f();
+ };
+
+ struct Covariant1 {
+ A d;
+ };
+
+ constexpr Covariant1 cb;
+}
+#endif
diff --git a/clang/test/AST/attr-print-emit.cpp b/clang/test/AST/attr-print-emit.cpp
index 8c8a2b2..d8e62ed 100644
--- a/clang/test/AST/attr-print-emit.cpp
+++ b/clang/test/AST/attr-print-emit.cpp
@@ -32,8 +32,8 @@ int *aa(int i) __attribute__((alloc_align(1)));
void ownt(int *, int *) __attribute__((ownership_takes(foo, 1, 2)));
// CHECK: void ownh(int *, int *) __attribute__((ownership_holds(foo, 1, 2)));
void ownh(int *, int *) __attribute__((ownership_holds(foo, 1, 2)));
-// CHECK: void ownr(int) __attribute__((ownership_returns(foo, 1)));
-void ownr(int) __attribute__((ownership_returns(foo, 1)));
+// CHECK: void *ownr(int) __attribute__((ownership_returns(foo, 1)));
+void *ownr(int) __attribute__((ownership_returns(foo, 1)));
// CHECK: void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 3, 2)));
void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 3, 2)));
@@ -65,8 +65,8 @@ class C {
void ownt(int *, int *) __attribute__((ownership_takes(foo, 2, 3)));
// CHECK: void ownh(int *, int *) __attribute__((ownership_holds(foo, 2, 3)));
void ownh(int *, int *) __attribute__((ownership_holds(foo, 2, 3)));
- // CHECK: void ownr(int) __attribute__((ownership_returns(foo, 2)));
- void ownr(int) __attribute__((ownership_returns(foo, 2)));
+ // CHECK: void *ownr(int) __attribute__((ownership_returns(foo, 2)));
+ void *ownr(int) __attribute__((ownership_returns(foo, 2)));
// CHECK: void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 4, 3)));
void awtt(int, int, ...) __attribute__((argument_with_type_tag(foo, 4, 3)));
diff --git a/clang/test/AST/explicit-base-class-move-cntr.cpp b/clang/test/AST/explicit-base-class-move-cntr.cpp
new file mode 100644
index 0000000..808af2f
--- /dev/null
+++ b/clang/test/AST/explicit-base-class-move-cntr.cpp
@@ -0,0 +1,171 @@
+// RUN: %clang_cc1 -ast-dump=json %s | FileCheck -strict-whitespace %s
+
+struct ExplicitBase {
+ explicit ExplicitBase(const char *) { }
+ ExplicitBase(const ExplicitBase &) {}
+ ExplicitBase(ExplicitBase &&) {}
+ ExplicitBase &operator=(const ExplicitBase &) { return *this; }
+ ExplicitBase &operator=(ExplicitBase &&) { return *this; }
+ ~ExplicitBase() { }
+};
+
+struct Derived1 : ExplicitBase {};
+
+Derived1 makeDerived1() {
+// CHECK: "kind": "FunctionDecl",
+// CHECK: "name": "makeDerived1",
+
+// CHECK: "kind": "CompoundStmt",
+
+// CHECK: "kind": "ReturnStmt",
+// CHECK: "kind": "ExprWithCleanups",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived1"
+// CHECK-NEXT: },
+
+// CHECK: "kind": "CXXFunctionalCastExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived1"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "castKind": "NoOp",
+
+// CHECK: "kind": "CXXBindTemporaryExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived1"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+
+// CHECK: "kind": "InitListExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived1"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+
+// CHECK: "kind": "CXXConstructExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "ExplicitBase"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "ctorType": {
+// CHECK-NEXT: "qualType": "void (ExplicitBase &&)"
+// CHECK-NEXT: },
+// CHECK-NEXT: "hadMultipleCandidates": true,
+// CHECK-NEXT: "constructionKind": "non-virtual base",
+
+// CHECK: "kind": "MaterializeTemporaryExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "ExplicitBase"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "xvalue",
+// CHECK-NEXT: "storageDuration": "full expression",
+
+// CHECK: "kind": "CXXBindTemporaryExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "ExplicitBase"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+
+// CHECK: "kind": "CXXTemporaryObjectExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "ExplicitBase"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "ctorType": {
+// CHECK-NEXT: "qualType": "void (const char *)"
+// CHECK-NEXT: },
+// CHECK-NEXT: "list": true,
+// CHECK-NEXT: "hadMultipleCandidates": true,
+// CHECK-NEXT: "constructionKind": "complete",
+
+// CHECK: "kind": "ImplicitCastExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "const char *"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "castKind": "ArrayToPointerDecay",
+
+// CHECK: "kind": "StringLiteral",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "const char[10]"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "lvalue",
+// CHECK-NEXT: "value": "\"Move Ctor\""
+ return Derived1{ExplicitBase{"Move Ctor"}};
+}
+
+struct ImplicitBase {
+ ImplicitBase(const char *) { }
+ ImplicitBase(const ImplicitBase &) {}
+ ImplicitBase(ImplicitBase &&) {}
+ ImplicitBase &operator=(const ImplicitBase &) { return *this; }
+ ImplicitBase &operator=(ImplicitBase &&) { return *this; }
+ ~ImplicitBase() { }
+};
+
+struct Derived2 : ImplicitBase {};
+
+Derived2 makeDerived2() {
+// CHECK: "kind": "FunctionDecl",
+// CHECK: "name": "makeDerived2",
+
+// CHECK: "kind": "CompoundStmt",
+
+// CHECK: "kind": "ReturnStmt",
+
+// CHECK: "kind": "ExprWithCleanups",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived2"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "cleanupsHaveSideEffects": true,
+
+// CHECK: "kind": "CXXFunctionalCastExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived2"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "castKind": "NoOp",
+
+// CHECK: "kind": "CXXBindTemporaryExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived2"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+
+// CHECK: "kind": "InitListExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "Derived2"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+
+// CHECK: "kind": "CXXConstructExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "ImplicitBase"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "ctorType": {
+// CHECK-NEXT: "qualType": "void (const char *)"
+// CHECK-NEXT: },
+// CHECK-NEXT: "list": true,
+// CHECK-NEXT: "hadMultipleCandidates": true,
+// CHECK-NEXT: "constructionKind": "non-virtual base",
+
+// CHECK: "kind": "ImplicitCastExpr",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "const char *"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "prvalue",
+// CHECK-NEXT: "castKind": "ArrayToPointerDecay",
+
+// CHECK: "kind": "StringLiteral",
+// CHECK: "type": {
+// CHECK-NEXT: "qualType": "const char[8]"
+// CHECK-NEXT: },
+// CHECK-NEXT: "valueCategory": "lvalue",
+// CHECK-NEXT: "value": "\"No Ctor\""
+ return Derived2{{"No Ctor"}};
+}
+
+// NOTE: CHECK lines have been autogenerated by gen_ast_dump_json_test.py
+// using --filters=FunctionDecl,CompoundStmt,ReturnStmt,MaterializeTemporaryExpr,CXXBindTemporaryExpr,CXXTemporaryObjectExpr,ImplicitCastExpr,StringLiteral
diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp
index 16954f3..c60f522 100644
--- a/clang/test/Analysis/live-stmts.cpp
+++ b/clang/test/Analysis/live-stmts.cpp
@@ -193,3 +193,112 @@ void test_lambda_refcapture() {
// CHECK-NEXT: [ B2 (live expressions at block exit) ]
// CHECK-EMPTY:
// CHECK-EMPTY:
+
+int logicalOpInTernary(bool b) {
+ return (b || b) ? 0 : 1;
+}
+
+// [B6 (ENTRY)]
+// |
+// V
+// [B5 (b || ...)]
+// | \
+// | |
+// V V
+// [B4 (b||b)] ? [B2 (0)] : [B3 (1)]
+// \ /
+// ---|----
+// V
+// [B1] --> [B0 (EXIT)]
+// return
+
+// CHECK: [ B0 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B1 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B2 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B3 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B4 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B5 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: BinaryOperator {{.*}} '_Bool' '||'
+// CHECK: |-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK: `-ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
+// CHECK-EMPTY:
+// CHECK-EMPTY:
+// CHECK: [ B6 (live expressions at block exit) ]
+// CHECK-EMPTY:
+// CHECK: ImplicitCastExpr {{.*}} '_Bool' <LValueToRValue>
+// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool'
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 0
+// CHECK-EMPTY:
+// CHECK: IntegerLiteral {{.*}} 'int' 1
diff --git a/clang/test/Analysis/short-circuiting-eval.cpp b/clang/test/Analysis/short-circuiting-eval.cpp
new file mode 100644
index 0000000..d0f29a8
--- /dev/null
+++ b/clang/test/Analysis/short-circuiting-eval.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core.DivideZero -verify %s
+
+int div0LogicalOpInTernary(bool b1) {
+ int y = (b1 || b1) ? 0 : 1;
+ return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0LogicalAndArith(bool b1, int x) {
+ int y = (b1 || (x < 3)) ? 0 : 1;
+ return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0NestedLogicalOp(bool b1) {
+ int y = (b1 && b1 || b1 && b1) ? 0 : 1;
+ return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0TernaryInTernary(bool b) {
+ int y = ((b || b) ? false : true) ? 0 : 1;
+ return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0LogicalOpParensInTernary(bool b1) {
+ int y = ((((b1)) || ((b1)))) ? 0 : 1;
+ return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0LogicalOpInsideStExpr(bool b1) {
+ int y = ({1; (b1 || b1);}) ? 0 : 1;
+ // expected-warning@-1 {{expression result unused}}
+ return 1 / y; // expected-warning {{Division by zero}}
+}
+
+int div0StExprInsideLogicalOp(bool b1) {
+ int y = (({1; b1;}) || ({1; b1;})) ? 0 : 1;
+ // expected-warning@-1 {{expression result unused}}
+ // expected-warning@-2 {{expression result unused}}
+ return 1 / y; // expected-warning {{Division by zero}}
+}
diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c
index b3a47ce..b9a5b1b 100644
--- a/clang/test/Analysis/stream.c
+++ b/clang/test/Analysis/stream.c
@@ -1,11 +1,11 @@
// RUN: %clang_analyze_cc1 -triple=x86_64-pc-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN: -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN: -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
// RUN: %clang_analyze_cc1 -triple=armv8-none-linux-eabi -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN: -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN: -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
// RUN: %clang_analyze_cc1 -triple=aarch64-linux-gnu -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN: -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN: -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
// RUN: %clang_analyze_cc1 -triple=hexagon -analyzer-checker=core,unix.Stream,debug.ExprInspection \
-// RUN: -analyzer-config unix.Stream:Pedantic=true -verify %s
+// RUN: -analyzer-config eagerly-assume=false,unix.Stream:Pedantic=true -verify %s
#include "Inputs/system-header-simulator.h"
#include "Inputs/system-header-simulator-for-malloc.h"
@@ -499,14 +499,34 @@ void gh_93408_regression_ZeroSized(struct ZeroSized *buffer) {
fclose(f);
}
-extern FILE *stdout_like_ptr;
-void no_aliasing(void) {
+extern FILE *non_standard_stream_ptr;
+void test_fopen_does_not_alias_with_standard_streams(void) {
FILE *f = fopen("file", "r");
- clang_analyzer_eval(f == stdin); // expected-warning {{FALSE}} no-TRUE
- clang_analyzer_eval(f == stdout); // expected-warning {{FALSE}} no-TRUE
- clang_analyzer_eval(f == stderr); // expected-warning {{FALSE}} no-TRUE
- clang_analyzer_eval(f == stdout_like_ptr); // expected-warning {{FALSE}} expected-warning {{TRUE}}
- if (f && f != stdout) {
+ if (!f) return;
+ clang_analyzer_eval(f == stdin); // expected-warning {{FALSE}} no-TRUE
+ clang_analyzer_eval(f == stdout); // expected-warning {{FALSE}} no-TRUE
+ clang_analyzer_eval(f == stderr); // expected-warning {{FALSE}} no-TRUE
+ clang_analyzer_eval(f == non_standard_stream_ptr); // expected-warning {{UNKNOWN}}
+ if (f != stdout) {
fclose(f);
}
} // no-leak: 'fclose()' is always called because 'f' cannot be 'stdout'.
+
+void reopen_std_stream(void) {
+ FILE *oldStdout = stdout;
+ fclose(stdout);
+ FILE *fp = fopen("blah", "w");
+ if (!fp) return;
+
+ stdout = fp; // Let's make them alias.
+ clang_analyzer_eval(fp == oldStdout); // expected-warning {{UNKNOWN}}
+ clang_analyzer_eval(fp == stdout); // expected-warning {{TRUE}} no-FALSE
+ clang_analyzer_eval(oldStdout == stdout); // expected-warning {{UNKNOWN}}
+}
+
+void only_success_path_does_not_alias_with_stdout(void) {
+ if (stdout) return;
+ FILE *f = fopen("/tmp/foof", "r"); // no-crash
+ if (!f) return;
+ fclose(f);
+}
diff --git a/clang/test/C/C23/n3018.c b/clang/test/C/C23/n3018.c
index 0d54d53..4ad2fff 100644
--- a/clang/test/C/C23/n3018.c
+++ b/clang/test/C/C23/n3018.c
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -std=c23 -verify -triple x86_64 -pedantic -Wno-conversion -Wno-constant-conversion %s
+// RUN: %clang_cc1 -std=c23 -verify -triple x86_64 -pedantic -Wno-conversion -Wno-constant-conversion -fexperimental-new-constant-interpreter %s
/* WG14 N3018: Full
* The constexpr specifier for object definitions
diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.mptr/p2.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.mptr/p2.cpp
new file mode 100644
index 0000000..a06b107
--- /dev/null
+++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.mptr/p2.cpp
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+template<typename T>
+struct A0 {
+ struct B0;
+
+ template<typename U>
+ struct C0 {
+ struct D0;
+
+ template<typename V>
+ struct E0;
+ };
+};
+
+template<typename T>
+int A0<T>::B0::* f0();
+
+template<typename T>
+int A0<T>::B1::* f1();
+
+template<typename T>
+int A0<T>::C0<int>::* f2(); // expected-error {{expected unqualified-id}}
+
+template<typename T>
+int A0<T>::C1<int>::* f3(); // expected-error {{no member named 'C1' in 'A0<T>'}}
+ // expected-error@-1 {{expected ';' after top level declarator}}
+
+template<typename T>
+int A0<T>::template C2<int>::* f4();
+
+template<typename T>
+int A0<T>::template C0<int>::D0::* f5();
+
+template<typename T>
+int A0<T>::template C2<int>::D1::* f6();
+
+template<typename T>
+int A0<T>::template C0<int>::E0<int>::* f7(); // expected-error {{use 'template' keyword to treat 'E0' as a dependent template name}}
+ // expected-error@-1 {{expected unqualified-id}}
+
+template<typename T>
+int A0<T>::template C2<int>::E1<int>::* f8(); // expected-error {{no member named 'C2' in 'A0<T>'}}
+
+template<typename T>
+int A0<T>::template C0<int>::template E0<int>::* f9();
+
+template<typename T>
+int A0<T>::template C2<int>::template E1<int>::* f10();
+
+namespace TypoCorrection {
+ template<typename T>
+ struct A {
+ template<typename U>
+ struct Typo; // expected-note {{'Typo' declared here}}
+ };
+
+ template<typename T>
+ int A<T>::template typo<int>::* f();
+
+ template<typename T>
+ int A<T>::typo<int>::* g(); // expected-error {{no template named 'typo' in 'A<T>'; did you mean 'Typo'?}}
+ // expected-error@-1 {{expected unqualified-id}}
+}
diff --git a/clang/test/CXX/temp/temp.res/p3.cpp b/clang/test/CXX/temp/temp.res/p3.cpp
index 37ab935..1eda967 100644
--- a/clang/test/CXX/temp/temp.res/p3.cpp
+++ b/clang/test/CXX/temp/temp.res/p3.cpp
@@ -2,7 +2,7 @@
template<typename T> struct A {
template<typename U> struct B;
- template<typename U> using C = U; // expected-note {{here}}
+ template<typename U> using C = U;
};
struct X {
@@ -20,12 +20,10 @@ template<typename T> A<T>::C<T> f2(); // expected-warning {{missing 'typename'}}
template<typename T> A<T>::C<X>::X(T) {}
template<typename T> A<T>::C<X>::X::Y::Y(T) {}
-// FIXME: This is ill-formed
-template<typename T> int A<T>::B<T>::*f3() {}
-template<typename T> int A<T>::C<X>::*f4() {}
+template<typename T> int A<T>::B<T>::*f3() {} // expected-error {{expected unqualified-id}}
+template<typename T> int A<T>::C<X>::*f4() {} // expected-error {{expected unqualified-id}}
-// FIXME: This is valid
-template<typename T> int A<T>::template C<int>::*f5() {} // expected-error {{has no members}}
+template<typename T> int A<T>::template C<int>::*f5() {}
template<typename T> template<typename U> struct A<T>::B {
friend A<T>::C<T> f6(); // ok, same as 'friend T f6();'
diff --git a/clang/test/CodeGen/LoongArch/align.c b/clang/test/CodeGen/LoongArch/align.c
new file mode 100644
index 0000000..1b171b7
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/align.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +lsx -target-feature \
+// RUN: +lasx -emit-llvm %s -o - | FileCheck %s --check-prefix=LA32
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature \
+// RUN: +lasx -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64
+
+#include <stddef.h>
+#include <stdint.h>
+
+char *s1 = "1234";
+// LA32: @.str{{.*}} ={{.*}} constant [5 x i8] c"1234\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [5 x i8] c"1234\00", align 1
+
+char *s2 = "12345678abcd";
+// LA32: @.str{{.*}} ={{.*}} constant [13 x i8] c"12345678abcd\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [13 x i8] c"12345678abcd\00", align 1
+
+char *s3 = "123456789012345678901234567890ab";
+// LA32: @.str{{.*}} ={{.*}} constant [33 x i8] c"1234{{.*}}ab\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [33 x i8] c"1234{{.*}}ab\00", align 1
+
+char *s4 = "123456789012345678901234567890123456789012345678901234567890abcdef";
+// LA32: @.str{{.*}} ={{.*}} constant [67 x i8] c"1234{{.*}}cdef\00", align 1
+// LA64: @.str{{.*}} ={{.*}} constant [67 x i8] c"1234{{.*}}cdef\00", align 1
+
+int8_t a;
+// LA32: @a ={{.*}} global i8 0, align 1
+// LA64: @a ={{.*}} global i8 0, align 1
+
+int16_t b;
+// LA32: @b ={{.*}} global i16 0, align 2
+// LA64: @b ={{.*}} global i16 0, align 2
+
+int32_t c;
+// LA32: @c ={{.*}} global i32 0, align 4
+// LA64: @c ={{.*}} global i32 0, align 4
+
+int64_t d;
+// LA32: @d ={{.*}} global i64 0, align 8
+// LA64: @d ={{.*}} global i64 0, align 8
+
+intptr_t e;
+// LA32: @e ={{.*}} global i32 0, align 4
+// LA64: @e ={{.*}} global i64 0, align 8
+
+float f;
+// LA32: @f ={{.*}} global float 0.000000e+00, align 4
+// LA64: @f ={{.*}} global float 0.000000e+00, align 4
+
+double g;
+// LA32: @g ={{.*}} global double 0.000000e+00, align 8
+// LA64: @g ={{.*}} global double 0.000000e+00, align 8
+
+struct H {
+ int8_t a;
+};
+struct H h;
+// LA32: @h ={{.*}} global %struct.H zeroinitializer, align 1
+// LA64: @h ={{.*}} global %struct.H zeroinitializer, align 1
diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
index 25aebec..9c3d08a 100644
--- a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
+++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
@@ -3,6 +3,8 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s
+// REQUIRES: aarch64-registered-target
+
#define __ai __attribute__((always_inline))
__ai void inlined_fn(void) {}
__ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {}
@@ -20,7 +22,7 @@ void caller(void) {
#ifdef TEST_COMPATIBLE
void caller_compatible(void) __arm_streaming_compatible {
- inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes}}
+ inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes, inlining may change runtime behaviour}}
inlined_fn_streaming_compatible();
inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller_compatible' have mismatching streaming attributes}}
inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller_compatible' have mismatching streaming attributes}}
@@ -29,7 +31,7 @@ void caller_compatible(void) __arm_streaming_compatible {
#ifdef TEST_STREAMING
void caller_streaming(void) __arm_streaming {
- inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes}}
+ inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes, inlining may change runtime behaviour}}
inlined_fn_streaming_compatible();
inlined_fn_streaming();
inlined_fn_local();
@@ -39,7 +41,7 @@ void caller_streaming(void) __arm_streaming {
#ifdef TEST_LOCALLY
__arm_locally_streaming
void caller_local(void) {
- inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes}}
+ inlined_fn(); // expected-warning {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes, inlining may change runtime behaviour}}
inlined_fn_streaming_compatible();
inlined_fn_streaming();
inlined_fn_local();
diff --git a/clang/test/CodeGen/arm64ec-hybrid-patchable.c b/clang/test/CodeGen/arm64ec-hybrid-patchable.c
new file mode 100644
index 0000000..4d1fa12
--- /dev/null
+++ b/clang/test/CodeGen/arm64ec-hybrid-patchable.c
@@ -0,0 +1,34 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple arm64ec-pc-windows -fms-extensions -emit-llvm -o - %s -verify | FileCheck %s
+
+// CHECK: ; Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func() #0 {
+int __attribute__((hybrid_patchable)) func(void) { return 1; }
+
+// CHECK: ; Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func2() #0 {
+int __declspec(hybrid_patchable) func2(void) { return 2; }
+
+// CHECK: ; Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func3() #0 {
+int __declspec(hybrid_patchable) func3(void);
+int func3(void) { return 3; }
+
+// CHECK: ; Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define dso_local i32 @func4() #0 {
+[[clang::hybrid_patchable]] int func4(void);
+int func4(void) { return 3; }
+
+// CHECK: ; Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define internal void @static_func() #0 {
+// expected-warning@+1 {{'hybrid_patchable' is ignored on functions without external linkage}}
+static void __declspec(hybrid_patchable) static_func(void) {}
+
+// CHECK: ; Function Attrs: hybrid_patchable noinline nounwind optnone
+// CHECK-NEXT: define linkonce_odr dso_local i32 @func5() #0 comdat {
+int inline __declspec(hybrid_patchable) func5(void) { return 4; }
+
+void caller(void) {
+ static_func();
+ func5();
+}
diff --git a/clang/test/CodeGen/inline-asm-size-zero.c b/clang/test/CodeGen/inline-asm-size-zero.c
new file mode 100644
index 0000000..564f520
--- /dev/null
+++ b/clang/test/CodeGen/inline-asm-size-zero.c
@@ -0,0 +1,6 @@
+// RUN: not %clang_cc1 -S %s -verify -o -
+
+void foo(void) {
+ extern long bar[];
+ asm ("" : "=r"(bar)); // expected-error{{output size should not be zero}}
+}
diff --git a/clang/test/CodeGen/math-libcalls-tbaa.c b/clang/test/CodeGen/math-libcalls-tbaa.c
new file mode 100644
index 0000000..9c86eea
--- /dev/null
+++ b/clang/test/CodeGen/math-libcalls-tbaa.c
@@ -0,0 +1,170 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,NONEWSTRUCTPATHTBAA
+// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,NEWSTRUCTPATHTBAA
+
+float expf(float);
+double remainder(double, double);
+double fabs(double);
+double frexp(double, int *exp);
+void sincos(float a, float *s, float *c);
+float _Complex cacoshf(float _Complex);
+float crealf(float _Complex);
+
+// Emit int TBAA metadata on FP math libcalls, which is useful for alias analysis
+
+// CHECK-LABEL: define dso_local float @test_expf(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 40
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
+// CHECK-NEXT: ret float [[MUL]]
+//
+float test_expf (float num[]) {
+ const float expm2 = expf(num[10]); // Emit TBAA metadata on @expf
+ float tmp = expm2 * num[10];
+ return tmp;
+}
+
+// CHECK-LABEL: define dso_local float @test_builtin_expf(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 40
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[TBAA6]]
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
+// CHECK-NEXT: ret float [[MUL]]
+//
+float test_builtin_expf (float num[]) {
+ const float expm2 = __builtin_expf(num[10]); // Emit TBAA metadata on @expf
+ float tmp = expm2 * num[10];
+ return tmp;
+}
+
+//
+// Negative test: fabs cannot set errno
+// CHECK-LABEL: define dso_local double @test_fabs(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 80
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.fabs.f64(double [[TMP0]])
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret double [[MUL]]
+//
+double test_fabs (double num[]) {
+ const double expm2 = fabs(num[10]); // Don't emit TBAA metadata
+ double tmp = expm2 * num[10];
+ return tmp;
+}
+
+// CHECK-LABEL: define dso_local double @test_remainder(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]], double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 80
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[TBAA6]]
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]]
+// CHECK-NEXT: ret double [[MUL]]
+//
+double test_remainder (double num[], double a) {
+ const double expm2 = remainder(num[10], a); // Emit TBAA metadata
+ double tmp = expm2 * num[10];
+ return tmp;
+}
+
+//
+// TODO: frexp does not set errno, but it writes to its int pointer
+// out argument, so it could emit int TBAA metadata.
+// CHECK-LABEL: define dso_local double @test_frexp(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[E:%.*]] = alloca i32, align 4
+// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[E]]) #[[ATTR9]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 16
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[CALL:%.*]] = call double @frexp(double noundef [[TMP0]], ptr noundef nonnull [[E]]) #[[ATTR9]]
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]]
+// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[E]]) #[[ATTR9]]
+// CHECK-NEXT: ret double [[MUL]]
+//
+double test_frexp (double num[]) {
+ int e;
+ double expm2 = frexp(num[2], &e); // Don't emit TBAA metadata
+ double tmp = expm2 * num[2];
+ return tmp;
+}
+
+//
+// Negative test: sincos is a library function, but is not a builtin function
+// checked in CodeGenFunction::EmitCallExpr.
+// CHECK-LABEL: define dso_local float @test_sincos(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SIN:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[COS:%.*]] = alloca float, align 4
+// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[SIN]]) #[[ATTR9]]
+// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[COS]]) #[[ATTR9]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 8
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: call void @sincos(float noundef [[TMP0]], ptr noundef nonnull [[SIN]], ptr noundef nonnull [[COS]]) #[[ATTR9]]
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[TMP2]]
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL]], [[TMP3]]
+// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[COS]]) #[[ATTR9]]
+// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[SIN]]) #[[ATTR9]]
+// CHECK-NEXT: ret float [[ADD]]
+//
+float test_sincos (float num[]) {
+ float sin, cos;
+ sincos(num[2], &sin, &cos); // Don't emit TBAA metadata
+ float tmp = sin * cos + num[2];
+ return tmp;
+}
+
+// TODO: The builtin returns a complex type
+// CHECK-LABEL: define dso_local float @test_cacoshf(
+// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]]) local_unnamed_addr #[[ATTR7]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 8
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x float] [[DOTFCA_0_INSERT]], float 0.000000e+00, 1
+// CHECK-NEXT: [[CALL:%.*]] = tail call { float, float } @cacoshf([2 x float] noundef alignstack(8) [[DOTFCA_1_INSERT]]) #[[ATTR9]]
+// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[CALL]], 0
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
+// CHECK-NEXT: ret float [[ADD]]
+//
+float test_cacoshf (float num[]) {
+ float _Complex z = cacoshf(num[2]); // Don't emit TBAA metadata
+ float tmp = crealf(z) + num[2];
+ return tmp;
+}
+
+//.
+// NONEWSTRUCTPATHTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META3]] = !{!"float", [[META4:![0-9]+]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META5]] = !{!"Simple C/C++ TBAA"}
+// NONEWSTRUCTPATHTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META7]] = !{!"int", [[META4]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META9]] = !{!"double", [[META4]], i64 0}
+//.
+// NEWSTRUCTPATHTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4}
+// NEWSTRUCTPATHTBAA: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"float"}
+// NEWSTRUCTPATHTBAA: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"}
+// NEWSTRUCTPATHTBAA: [[META5]] = !{!"Simple C/C++ TBAA"}
+// NEWSTRUCTPATHTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0, i64 4}
+// NEWSTRUCTPATHTBAA: [[META7]] = !{[[META4]], i64 4, !"int"}
+// NEWSTRUCTPATHTBAA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0, i64 8}
+// NEWSTRUCTPATHTBAA: [[META9]] = !{[[META4]], i64 8, !"double"}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// NEWSTRUCTPATHTBAA: {{.*}}
+// NONEWSTRUCTPATHTBAA: {{.*}}
diff --git a/clang/test/CodeGen/math-libcalls-tbaa.cpp b/clang/test/CodeGen/math-libcalls-tbaa.cpp
deleted file mode 100644
index f15938d..0000000
--- a/clang/test/CodeGen/math-libcalls-tbaa.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-
-// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NoNewStructPathTBAA
-// RUN: %clang_cc1 -triple=aarch64-unknown-linux-gnu -fmath-errno -O3 -new-struct-path-tbaa -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,NewStructPathTBAA
-
-extern "C" float expf(float);
-
-// Emit int TBAA metadata on FP math libcalls, which is useful for alias analysis
-
-// CHECK-LABEL: define dso_local float @foo(
-// CHECK-SAME: ptr nocapture noundef readonly [[NUM:%.*]], float noundef [[R2INV:%.*]], i32 noundef [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[NUM]], i64 40
-// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
-// CHECK-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA6:![0-9]+]]
-// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
-// CHECK-NEXT: ret float [[MUL]]
-//
-extern "C" float foo (float num[], float r2inv, int n) {
- const float expm2 = expf(num[10]); // Emit TBAA metadata on @expf
- float tmp = expm2 * num[10];
- return tmp;
-}
-//.
-// NoNewStructPathTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// NoNewStructPathTBAA: [[META3]] = !{!"float", [[META4:![0-9]+]], i64 0}
-// NoNewStructPathTBAA: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
-// NoNewStructPathTBAA: [[META5]] = !{!"Simple C++ TBAA"}
-// NoNewStructPathTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// NoNewStructPathTBAA: [[META7]] = !{!"int", [[META4]], i64 0}
-//.
-// NewStructPathTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4}
-// NewStructPathTBAA: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"float"}
-// NewStructPathTBAA: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"}
-// NewStructPathTBAA: [[META5]] = !{!"Simple C++ TBAA"}
-// NewStructPathTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0, i64 4}
-// NewStructPathTBAA: [[META7]] = !{[[META4]], i64 4, !"int"}
-//.
-//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-// NewStructPathTBAA: {{.*}}
-// NoNewStructPathTBAA: {{.*}}
diff --git a/clang/test/CodeGen/pr3518.c b/clang/test/CodeGen/pr3518.c
index f888add..a3cd866 100644
--- a/clang/test/CodeGen/pr3518.c
+++ b/clang/test/CodeGen/pr3518.c
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -fexperimental-new-constant-interpreter -emit-llvm -o - | FileCheck %s
// PR 3518
// Some of the objects were coming out as uninitialized (external) before 3518
// was fixed. Internal names are different between llvm-gcc and clang so they
diff --git a/clang/test/CodeGenCUDA/convergent.cu b/clang/test/CodeGenCUDA/convergent.cu
index 5d98d4ba..b187f3a 100644
--- a/clang/test/CodeGenCUDA/convergent.cu
+++ b/clang/test/CodeGenCUDA/convergent.cu
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
// REQUIRES: x86-registered-target
// REQUIRES: nvptx-registered-target
@@ -10,36 +11,89 @@
#include "Inputs/cuda.h"
-// DEVICE: Function Attrs:
-// DEVICE-SAME: convergent
-// DEVICE-NEXT: define{{.*}} void @_Z3foov
+// DEVICE-LABEL: define dso_local void @_Z3foov(
+// DEVICE-SAME: ) #[[ATTR0:[0-9]+]] {
+// DEVICE-NEXT: [[ENTRY:.*:]]
+// DEVICE-NEXT: ret void
+//
__device__ void foo() {}
+// DEVICE-LABEL: define dso_local void @_Z3baxv(
+// DEVICE-SAME: ) #[[ATTR1:[0-9]+]] {
+// DEVICE-NEXT: [[ENTRY:.*:]]
+// DEVICE-NEXT: ret void
+//
+[[clang::noconvergent]] __device__ void bax() {}
-// HOST: Function Attrs:
-// HOST-NOT: convergent
-// HOST-NEXT: define{{.*}} void @_Z3barv
-// DEVICE: Function Attrs:
-// DEVICE-SAME: convergent
-// DEVICE-NEXT: define{{.*}} void @_Z3barv
__host__ __device__ void baz();
+
+__host__ __device__ float aliasf0(int) asm("something");
+__host__ __device__ [[clang::noconvergent]] float aliasf1(int) asm("somethingelse");
+
+// DEVICE-LABEL: define dso_local void @_Z3barv(
+// DEVICE-SAME: ) #[[ATTR0]] {
+// DEVICE-NEXT: [[ENTRY:.*:]]
+// DEVICE-NEXT: [[X:%.*]] = alloca i32, align 4
+// DEVICE-NEXT: call void @_Z3bazv() #[[ATTR4:[0-9]+]]
+// DEVICE-NEXT: [[TMP0:%.*]] = call i32 asm "trap", "=l"() #[[ATTR5:[0-9]+]], !srcloc [[META3:![0-9]+]]
+// DEVICE-NEXT: store i32 [[TMP0]], ptr [[X]], align 4
+// DEVICE-NEXT: call void asm sideeffect "trap", ""() #[[ATTR4]], !srcloc [[META4:![0-9]+]]
+// DEVICE-NEXT: call void asm sideeffect "nop", ""() #[[ATTR6:[0-9]+]], !srcloc [[META5:![0-9]+]]
+// DEVICE-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// DEVICE-NEXT: [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]]) #[[ATTR4]]
+// DEVICE-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4
+// DEVICE-NEXT: [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]]) #[[ATTR6]]
+// DEVICE-NEXT: ret void
+//
+// HOST-LABEL: define dso_local void @_Z3barv(
+// HOST-SAME: ) #[[ATTR0:[0-9]+]] {
+// HOST-NEXT: [[ENTRY:.*:]]
+// HOST-NEXT: [[X:%.*]] = alloca i32, align 4
+// HOST-NEXT: call void @_Z3bazv()
+// HOST-NEXT: [[TMP0:%.*]] = call i32 asm "trap", "=l,~{dirflag},~{fpsr},~{flags}"() #[[ATTR2:[0-9]+]], !srcloc [[META2:![0-9]+]]
+// HOST-NEXT: store i32 [[TMP0]], ptr [[X]], align 4
+// HOST-NEXT: call void asm sideeffect "trap", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3:[0-9]+]], !srcloc [[META3:![0-9]+]]
+// HOST-NEXT: call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"() #[[ATTR3]], !srcloc [[META4:![0-9]+]]
+// HOST-NEXT: [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+// HOST-NEXT: [[CALL:%.*]] = call contract noundef float @something(i32 noundef [[TMP1]])
+// HOST-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4
+// HOST-NEXT: [[CALL1:%.*]] = call contract noundef float @somethingelse(i32 noundef [[TMP2]])
+// HOST-NEXT: ret void
+//
__host__ __device__ void bar() {
- // DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
baz();
- // DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
int x;
- asm ("trap;" : "=l"(x));
- // DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
- asm volatile ("trap;");
+ asm ("trap" : "=l"(x));
+ asm volatile ("trap");
+ [[clang::noconvergent]] { asm volatile ("nop"); }
+ aliasf0(x);
+ aliasf1(x);
}
-// DEVICE: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
-// DEVICE: attributes [[BAZ_ATTR]] = {
-// DEVICE-SAME: convergent
-// DEVICE-SAME: }
-// DEVICE-DAG: attributes [[CALL_ATTR]] = { convergent
-// DEVICE-DAG: attributes [[ASM_ATTR]] = { convergent
-
-// HOST: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
-// HOST: attributes [[BAZ_ATTR]] = {
-// HOST-NOT: convergent
-// HOST-SAME: }
+
+//.
+// DEVICE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR3:[0-9]+]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
+// DEVICE: attributes #[[ATTR4]] = { convergent nounwind }
+// DEVICE: attributes #[[ATTR5]] = { convergent nounwind memory(none) }
+// DEVICE: attributes #[[ATTR6]] = { nounwind }
+//.
+// HOST: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// HOST: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// HOST: attributes #[[ATTR2]] = { nounwind memory(none) }
+// HOST: attributes #[[ATTR3]] = { nounwind }
+//.
+// DEVICE: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// DEVICE: [[META1:![0-9]+]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
+// DEVICE: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// DEVICE: [[META3]] = !{i64 3120}
+// DEVICE: [[META4]] = !{i64 3155}
+// DEVICE: [[META5]] = !{i64 3206}
+//.
+// HOST: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// HOST: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// HOST: [[META2]] = !{i64 3120}
+// HOST: [[META3]] = !{i64 3155}
+// HOST: [[META4]] = !{i64 3206}
+//.
diff --git a/clang/test/CodeGenCXX/debug-info-explicit-this.cpp b/clang/test/CodeGenCXX/debug-info-explicit-this.cpp
new file mode 100644
index 0000000..45ab2a0
--- /dev/null
+++ b/clang/test/CodeGenCXX/debug-info-explicit-this.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -std=c++2b %s -o - | FileCheck %s
+
+struct Foo {
+ void Bar(this Foo&& self) {}
+};
+
+void fn() {
+ Foo{}.Bar();
+}
+
+// CHECK: distinct !DISubprogram(name: "Bar", {{.*}}, type: ![[BAR_TYPE:[0-9]+]], {{.*}}, declaration: ![[BAR_DECL:[0-9]+]], {{.*}}
+// CHECK: ![[FOO:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo"
+// CHECK: ![[BAR_DECL]] = !DISubprogram(name: "Bar", {{.*}}, type: ![[BAR_TYPE]], {{.*}},
+// CHECK: ![[BAR_TYPE]] = !DISubroutineType(types: ![[PARAMS:[0-9]+]])
+// CHECK: ![[PARAMS]] = !{null, ![[SELF:[0-9]+]]}
+// CHECK: ![[SELF]] = !DIDerivedType(tag: DW_TAG_rvalue_reference_type, baseType: ![[FOO]]
diff --git a/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
new file mode 100644
index 0000000..54063cf
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
@@ -0,0 +1,78 @@
+// REQUIRES: x86_64-linux
+// This tests that the coroutine elide optimization can happen successfully with ThinLTO.
+// This test is adapted from coro-elide.cpp and splits functions into two files.
+//
+// RUN: split-file %s %t
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-callee.cpp -o %t/coro-elide-callee.bc
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-caller.cpp -o %t/coro-elide-caller.bc
+// RUN: llvm-lto --thinlto %t/coro-elide-callee.bc %t/coro-elide-caller.bc -o %t/summary
+// RUN: %clang_cc1 -O2 -x ir %t/coro-elide-caller.bc -fthinlto-index=%t/summary.thinlto.bc -emit-llvm -o - | FileCheck %s
+
+//--- coro-elide-task.h
+#pragma once
+#include "Inputs/coroutine.h"
+
+struct Task {
+ struct promise_type {
+ struct FinalAwaiter {
+ bool await_ready() const noexcept { return false; }
+ template <typename PromiseType>
+ std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
+ if (!h)
+ return std::noop_coroutine();
+ return h.promise().continuation;
+ }
+ void await_resume() noexcept {}
+ };
+ Task get_return_object() noexcept {
+ return std::coroutine_handle<promise_type>::from_promise(*this);
+ }
+ std::suspend_always initial_suspend() noexcept { return {}; }
+ FinalAwaiter final_suspend() noexcept { return {}; }
+ void unhandled_exception() noexcept {}
+ void return_value(int x) noexcept {
+ _value = x;
+ }
+ std::coroutine_handle<> continuation;
+ int _value;
+ };
+
+ Task(std::coroutine_handle<promise_type> handle) : handle(handle) {}
+ ~Task() {
+ if (handle)
+ handle.destroy();
+ }
+
+ struct Awaiter {
+ bool await_ready() const noexcept { return false; }
+ void await_suspend(std::coroutine_handle<void> continuation) noexcept {}
+ int await_resume() noexcept {
+ return 43;
+ }
+ };
+
+ auto operator co_await() {
+ return Awaiter{};
+ }
+
+private:
+ std::coroutine_handle<promise_type> handle;
+};
+
+//--- coro-elide-callee.cpp
+#include "coro-elide-task.h"
+Task task0() {
+ co_return 43;
+}
+
+//--- coro-elide-caller.cpp
+#include "coro-elide-task.h"
+
+Task task0();
+
+Task task1() {
+ co_return co_await task0();
+}
+
+// CHECK-LABEL: define{{.*}} void @_Z5task1v.resume
+// CHECK-NOT: {{.*}}_Znwm
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index 8ab6a07..b60d31bae 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -18,13 +18,17 @@
// AS_LINK_UR: "-cc1as"
// AS_LINK_UR: ld.lld{{.*}} "--no-undefined"{{.*}} "--unresolved-symbols=ignore-all"
-// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
// RUN: -L. -flto -fconvergent-functions %s 2>&1 | FileCheck -check-prefixes=LTO,MCPU %s
-// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+:sramecc- -nogpulib \
// RUN: -L. -fconvergent-functions %s 2>&1 | FileCheck -check-prefix=MCPU %s
// LTO: clang{{.*}} "-flto=full"{{.*}}"-fconvergent-functions"
-// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx906"
+// MCPU: ld.lld{{.*}}"-L."{{.*}}"-plugin-opt=mcpu=gfx90a"{{.*}}"-plugin-opt=-mattr=-sramecc,+xnack"
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
// RUN: -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s
// LD: ld.lld
+
+// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
+// RUN: -r %s 2>&1 | FileCheck -check-prefixes=RELO %s
+// RELO-NOT: -shared
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index c2e538c..5f24e7a 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -84,6 +84,13 @@
// MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'ptxas'
// MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'nvlink'
+// Do not error when performing LTO.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -flto %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=MISSING-LTO %s
+
+// MISSING-LTO-NOT: error: must pass in an explicit nvptx64 gpu architecture to 'nvlink'
+
// RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=GENERIC %s
// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_52 -march=generic -flto -c %s -### 2>&1 \
diff --git a/clang/test/Driver/immediate-options.c b/clang/test/Driver/immediate-options.c
index 77878fe..b74f6b4 100644
--- a/clang/test/Driver/immediate-options.c
+++ b/clang/test/Driver/immediate-options.c
@@ -2,6 +2,8 @@
// HELP: isystem
// HELP-NOT: ast-dump
// HELP-NOT: driver-mode
+// HELP: -Wa,
+// HELP-NOT: -W{{[a-z][a-z]}}
// Make sure that Flang-only options are not available in Clang
// HELP-NOT: test-io
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 342907c..99cfdb9 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -129,14 +129,14 @@ __attribute__((visibility("protected"), used)) int x;
// RUN: -fembed-offload-object=%t.out
// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
// RUN: --linker-path=/usr/bin/ld --device-linker=foo=bar --device-linker=a \
-// RUN: --device-linker=nvptx64-nvidia-cuda=b \
+// RUN: --device-linker=nvptx64-nvidia-cuda=b --device-compiler=foo\
// RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LINKER-ARGS
-// LINKER-ARGS: clang{{.*}}--target=amdgcn-amd-amdhsa{{.*}}foo=bar{{.*}}a
-// LINKER-ARGS: clang{{.*}}--target=nvptx64-nvidia-cuda{{.*}}foo=bar{{.*}}a b
+// LINKER-ARGS: clang{{.*}}--target=amdgcn-amd-amdhsa{{.*}}-Xlinker foo=bar{{.*}}-Xlinker a{{.*}}foo
+// LINKER-ARGS: clang{{.*}}--target=nvptx64-nvidia-cuda{{.*}}-Xlinker foo=bar{{.*}}-Xlinker a -Xlinker b{{.*}}foo
-// RUN: not clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -ldummy \
-// RUN: --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b \
+// RUN: not clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
+// RUN: -ldummy --linker-path=/usr/bin/ld \
// RUN: -o a.out 2>&1 | FileCheck %s --check-prefix=MISSING-LIBRARY
// MISSING-LIBRARY: error: unable to find library -ldummy
@@ -234,3 +234,13 @@ __attribute__((visibility("protected"), used)) int x;
// RUN: | FileCheck %s --check-prefix=OVERRIDE
// OVERRIDE-NOT: clang
// OVERRIDE: /usr/bin/ld
+
+// RUN: clang-offload-packager -o %t.out \
+// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --offload-opt=-pass-remarks=foo \
+// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=OFFLOAD-OPT
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run -mllvm -pass-remarks=foo \
+// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=OFFLOAD-OPT
+
+// OFFLOAD-OPT: clang{{.*}}-Wl,--plugin-opt=-pass-remarks=foo
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index 0314f28..ef6cbdd 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -377,4 +377,4 @@
// RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
// RUN: --offload-arch=sm_52 -nogpulibc -nogpuinc %s 2>&1 \
// RUN: | FileCheck --check-prefix=LIBC-GPU %s
-// LIBC-GPU: clang-linker-wrapper{{.*}}"--device-linker=-nolibc"
+// LIBC-GPU: clang-linker-wrapper{{.*}}"--device-compiler=-nolibc"
diff --git a/clang/test/Driver/ps4-linker.c b/clang/test/Driver/ps4-linker.c
index 449da30..2a095d6 100644
--- a/clang/test/Driver/ps4-linker.c
+++ b/clang/test/Driver/ps4-linker.c
@@ -16,3 +16,8 @@
// RUN: %clang --target=x86_64-scei-ps4 -flto=full -fcrash-diagnostics-dir=mydumps %s -### 2>&1 | FileCheck --check-prefixes=CHECK-DIAG-LTO %s
// CHECK-DIAG-LTO: "-lto-debug-options= -crash-diagnostics-dir=mydumps"
+
+// Test that -lto-debug-options is only supplied to the linker when necessary
+
+// RUN: %clang --target=x86_64-scei-ps4 %s -### 2>&1 | FileCheck --check-prefixes=CHECK-NO-LTO %s
+// CHECK-NO-LTO-NOT: -lto-debug-options
diff --git a/clang/test/Driver/unified-lto.c b/clang/test/Driver/unified-lto.c
index 490aaca..445ca0b 100644
--- a/clang/test/Driver/unified-lto.c
+++ b/clang/test/Driver/unified-lto.c
@@ -27,7 +27,7 @@
// RUN: %clang --target=x86_64-sie-ps5 -### %s -fno-unified-lto -flto=full 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
// RUN: %clang --target=x86_64-sie-ps5 -### %s -fno-unified-lto -flto=thin 2>&1 | FileCheck --check-prefixes=LD,NOLTO %s
-// LD: {{.*ld}}"
+// LD: {{.*ld(\.exe)?}}"
// LTOFULL-SAME: "--lto=full"
// LTOTHIN-SAME: "--lto=thin"
// NOLTO-NOT: "--lto
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 63237cb..7d77ae7 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -309,8 +309,8 @@
// HRESET: "-target-feature" "+hreset"
// NO-HRESET: "-target-feature" "-hreset"
-// RUN: %clang --target=i386 -march=i386 -muintr %s -### 2>&1 | FileCheck -check-prefix=UINTR %s
-// RUN: %clang --target=i386 -march=i386 -mno-uintr %s -### 2>&1 | FileCheck -check-prefix=NO-UINTR %s
+// RUN: %clang --target=x86_64 -muintr %s -### 2>&1 | FileCheck -check-prefix=UINTR %s
+// RUN: %clang --target=x86_64 -mno-uintr %s -### 2>&1 | FileCheck -check-prefix=NO-UINTR %s
// UINTR: "-target-feature" "+uintr"
// NO-UINTR: "-target-feature" "-uintr"
@@ -409,6 +409,15 @@
// NONX86-NEXT: warning: argument unused during compilation: '-msse4.2' [-Wunused-command-line-argument]
// NONX86-NEXT: error: unsupported option '-mno-sgx' for target 'aarch64'
+// RUN: not %clang -### --target=i386 -muintr %s 2>&1 | FileCheck --check-prefix=NON-UINTR %s
+// RUN: %clang -### --target=i386 -mno-uintr %s 2>&1 > /dev/null
+// RUN: not %clang -### --target=i386 -mapx-features=ndd %s 2>&1 | FileCheck --check-prefix=NON-APX %s
+// RUN: not %clang -### --target=i386 -mapxf %s 2>&1 | FileCheck --check-prefix=NON-APX %s
+// RUN: %clang -### --target=i386 -mno-apxf %s 2>&1 > /dev/null
+// NON-UINTR: error: unsupported option '-muintr' for target 'i386'
+// NON-APX: error: unsupported option '-mapx-features=|-mapxf' for target 'i386'
+// NON-APX-NOT: error: {{.*}} -mapx-features=
+
// RUN: %clang --target=i386 -march=i386 -mharden-sls=return %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=SLS-RET,NO-SLS %s
// RUN: %clang --target=i386 -march=i386 -mharden-sls=indirect-jmp %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=SLS-IJMP,NO-SLS %s
// RUN: %clang --target=i386 -march=i386 -mharden-sls=none -mharden-sls=all %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=SLS-IJMP,SLS-RET %s
diff --git a/clang/test/Index/pch-with-errors.c b/clang/test/Index/pch-with-errors.c
index e8711c8..cfe58c1 100644
--- a/clang/test/Index/pch-with-errors.c
+++ b/clang/test/Index/pch-with-errors.c
@@ -38,7 +38,7 @@ void foo(void) {
// CHECK-INDEX: [indexEntityReference]: kind: function | name: erroneous
// RUN: not %clang -fsyntax-only %s -include %t.h 2>&1 | FileCheck -check-prefix=PCH-ERR %s
-// PCH-ERR: error: PCH file contains compiler errors
+// PCH-ERR: error: PCH file '{{.*}}' contains compiler errors
// RUN: not c-index-test -write-pch %t.pch foobar.c 2>&1 | FileCheck -check-prefix=NONEXISTENT %s
// NONEXISTENT: Unable to load translation unit
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 33f9c2f..0f7dcab 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -83,6 +83,7 @@
// CHECK-NEXT: HIPManaged (SubjectMatchRule_variable)
// CHECK-NEXT: HLSLResourceClass (SubjectMatchRule_record_not_is_union)
// CHECK-NEXT: Hot (SubjectMatchRule_function)
+// CHECK-NEXT: HybridPatchable (SubjectMatchRule_function)
// CHECK-NEXT: IBAction (SubjectMatchRule_objc_method_is_instance)
// CHECK-NEXT: IFunc (SubjectMatchRule_function)
// CHECK-NEXT: InitPriority (SubjectMatchRule_variable)
@@ -108,6 +109,7 @@
// CHECK-NEXT: Naked (SubjectMatchRule_function)
// CHECK-NEXT: NoBuiltin (SubjectMatchRule_function)
// CHECK-NEXT: NoCommon (SubjectMatchRule_variable)
+// CHECK-NEXT: NoConvergent (SubjectMatchRule_function)
// CHECK-NEXT: NoDebug (SubjectMatchRule_type_alias, SubjectMatchRule_hasType_functionType, SubjectMatchRule_objc_method, SubjectMatchRule_variable_not_is_parameter)
// CHECK-NEXT: NoDestroy (SubjectMatchRule_variable)
// CHECK-NEXT: NoDuplicate (SubjectMatchRule_function)
diff --git a/clang/test/Modules/load-module-with-errors.m b/clang/test/Modules/load-module-with-errors.m
index 1f8e483..6e10cb3 100644
--- a/clang/test/Modules/load-module-with-errors.m
+++ b/clang/test/Modules/load-module-with-errors.m
@@ -1,7 +1,7 @@
// Note: the run lines follow their respective tests, since line/column
// matter in this test.
-// pcherror-error@* {{PCH file contains compiler errors}}
+// pcherror-error-re@* {{module file '{{.*}}use_error_a.pcm' contains compiler errors}}
@import use_error_a; // notallowerror-error {{could not build module 'use_error_a'}}
@import use_error_b;
// expected-no-diagnostics
@@ -61,7 +61,7 @@ void test(Error *x) {
// RUN: -fmodule-file=%t/prebuilt/use_error_a.pcm \
// RUN: -fmodule-file=%t/prebuilt/use_error_b.pcm \
// RUN: -fmodules-cache-path=%t 2>&1 | \
-// RUN: grep "PCH file contains compiler errors"
+// RUN: grep "module file .* contains compiler errors"
// Shouldn't build the cached modules (that have errors) when not allowing
// errors
diff --git a/clang/test/OpenMP/nvptx_target_printf_codegen.c b/clang/test/OpenMP/nvptx_target_printf_codegen.c
deleted file mode 100644
index f53daf6..0000000
--- a/clang/test/OpenMP/nvptx_target_printf_codegen.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
-// Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK-32
-// expected-no-diagnostics
-extern int printf(const char *, ...);
-
-
-// Check a simple call to printf end-to-end.
-int CheckSimple(void) {
-#pragma omp target
- {
- // printf in master-only basic block.
- const char* fmt = "%d %lld %f";
-
- printf(fmt, 1, 2ll, 3.0);
- }
-
- return 0;
-}
-
-void CheckNoArgs(void) {
-#pragma omp target
- {
- // printf in master-only basic block.
- printf("hello, world!");
- }
-}
-
-// Check that printf's alloca happens in the entry block, not inside the if
-// statement.
-int foo;
-void CheckAllocaIsInEntryBlock(void) {
-#pragma omp target
- {
- if (foo) {
- printf("%d", 42);
- }
- }
-}
-// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13
-// CHECK-64-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-64-NEXT: entry:
-// CHECK-64-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT: [[FMT:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8
-// CHECK-64-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-64: user_code.entry:
-// CHECK-64-NEXT: store ptr @.str, ptr [[FMT]], align 8
-// CHECK-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[FMT]], align 8
-// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 0
-// CHECK-64-NEXT: store i32 1, ptr [[TMP2]], align 4
-// CHECK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 1
-// CHECK-64-NEXT: store i64 2, ptr [[TMP3]], align 8
-// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 2
-// CHECK-64-NEXT: store double 3.000000e+00, ptr [[TMP4]], align 8
-// CHECK-64-NEXT: [[TMP5:%.*]] = call i32 @__llvm_omp_vprintf(ptr [[TMP1]], ptr [[TMP]], i32 24)
-// CHECK-64-NEXT: call void @__kmpc_target_deinit()
-// CHECK-64-NEXT: ret void
-// CHECK-64: worker.exit:
-// CHECK-64-NEXT: ret void
-//
-//
-// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25
-// CHECK-64-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
-// CHECK-64-NEXT: entry:
-// CHECK-64-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-64: user_code.entry:
-// CHECK-64-NEXT: [[TMP1:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str1, ptr null, i32 0)
-// CHECK-64-NEXT: call void @__kmpc_target_deinit()
-// CHECK-64-NEXT: ret void
-// CHECK-64: worker.exit:
-// CHECK-64-NEXT: ret void
-//
-//
-// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36
-// CHECK-64-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[FOO:%.*]]) #[[ATTR0]] {
-// CHECK-64-NEXT: entry:
-// CHECK-64-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-64-NEXT: [[FOO_ADDR:%.*]] = alloca i64, align 8
-// CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8
-// CHECK-64-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK-64-NEXT: store i64 [[FOO]], ptr [[FOO_ADDR]], align 8
-// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-64: user_code.entry:
-// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[FOO_ADDR]], align 4
-// CHECK-64-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
-// CHECK-64-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-// CHECK-64: if.then:
-// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], ptr [[TMP]], i32 0, i32 0
-// CHECK-64-NEXT: store i32 42, ptr [[TMP2]], align 4
-// CHECK-64-NEXT: [[TMP3:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str2, ptr [[TMP]], i32 4)
-// CHECK-64-NEXT: br label [[IF_END]]
-// CHECK-64: worker.exit:
-// CHECK-64-NEXT: ret void
-// CHECK-64: if.end:
-// CHECK-64-NEXT: call void @__kmpc_target_deinit()
-// CHECK-64-NEXT: ret void
-//
-//
-// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13
-// CHECK-32-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
-// CHECK-32-NEXT: entry:
-// CHECK-32-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT: [[FMT:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8
-// CHECK-32-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckSimple_l13_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-32: user_code.entry:
-// CHECK-32-NEXT: store ptr @.str, ptr [[FMT]], align 4
-// CHECK-32-NEXT: [[TMP1:%.*]] = load ptr, ptr [[FMT]], align 4
-// CHECK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 0
-// CHECK-32-NEXT: store i32 1, ptr [[TMP2]], align 4
-// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 1
-// CHECK-32-NEXT: store i64 2, ptr [[TMP3]], align 8
-// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PRINTF_ARGS]], ptr [[TMP]], i32 0, i32 2
-// CHECK-32-NEXT: store double 3.000000e+00, ptr [[TMP4]], align 8
-// CHECK-32-NEXT: [[TMP5:%.*]] = call i32 @__llvm_omp_vprintf(ptr [[TMP1]], ptr [[TMP]], i32 24)
-// CHECK-32-NEXT: call void @__kmpc_target_deinit()
-// CHECK-32-NEXT: ret void
-// CHECK-32: worker.exit:
-// CHECK-32-NEXT: ret void
-//
-//
-// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25
-// CHECK-32-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
-// CHECK-32-NEXT: entry:
-// CHECK-32-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-32: user_code.entry:
-// CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str1, ptr null, i32 0)
-// CHECK-32-NEXT: call void @__kmpc_target_deinit()
-// CHECK-32-NEXT: ret void
-// CHECK-32: worker.exit:
-// CHECK-32-NEXT: ret void
-//
-//
-// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36
-// CHECK-32-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[FOO:%.*]]) #[[ATTR0]] {
-// CHECK-32-NEXT: entry:
-// CHECK-32-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
-// CHECK-32-NEXT: [[FOO_ADDR:%.*]] = alloca i32, align 4
-// CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8
-// CHECK-32-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
-// CHECK-32-NEXT: store i32 [[FOO]], ptr [[FOO_ADDR]], align 4
-// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckAllocaIsInEntryBlock_l36_kernel_environment, ptr [[DYN_PTR]])
-// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
-// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK-32: user_code.entry:
-// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, ptr [[FOO_ADDR]], align 4
-// CHECK-32-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
-// CHECK-32-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-// CHECK-32: if.then:
-// CHECK-32-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PRINTF_ARGS_0]], ptr [[TMP]], i32 0, i32 0
-// CHECK-32-NEXT: store i32 42, ptr [[TMP2]], align 4
-// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__llvm_omp_vprintf(ptr @.str2, ptr [[TMP]], i32 4)
-// CHECK-32-NEXT: br label [[IF_END]]
-// CHECK-32: worker.exit:
-// CHECK-32-NEXT: ret void
-// CHECK-32: if.end:
-// CHECK-32-NEXT: call void @__kmpc_target_deinit()
-// CHECK-32-NEXT: ret void
-//
diff --git a/clang/test/Preprocessor/embed_weird.cpp b/clang/test/Preprocessor/embed_weird.cpp
index 90180e2..9a984e4 100644
--- a/clang/test/Preprocessor/embed_weird.cpp
+++ b/clang/test/Preprocessor/embed_weird.cpp
@@ -4,6 +4,8 @@
// RUN: printf "\0" > %t/null_byte.bin
// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%t -verify=expected,cxx -Wno-c23-extensions
// RUN: %clang_cc1 -x c -std=c23 %s -fsyntax-only --embed-dir=%t -verify=expected,c
+// RUN: %clang_cc1 %s -fsyntax-only -fexperimental-new-constant-interpreter --embed-dir=%t -verify=expected,cxx -Wno-c23-extensions
+// RUN: %clang_cc1 -x c -std=c23 %s -fsyntax-only -fexperimental-new-constant-interpreter --embed-dir=%t -verify=expected,c
#embed <media/empty>
;
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 1babf47..5d510cb 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -512,11 +512,11 @@
// NOHRESET-NOT: #define __HRESET__ 1
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -muintr -x c -E -dM -o - %s | FileCheck -check-prefix=UINTR %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -muintr -x c -E -dM -o - %s | FileCheck -check-prefix=UINTR %s
// UINTR: #define __UINTR__ 1
-// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
// NOUINTR-NOT: #define __UINTR__ 1
diff --git a/clang/test/Sema/attr-ownership.c b/clang/test/Sema/attr-ownership.c
index 0846243..d2e4053 100644
--- a/clang/test/Sema/attr-ownership.c
+++ b/clang/test/Sema/attr-ownership.c
@@ -18,7 +18,7 @@ void *f12(float i, int k, int f, int *j) __attribute__((ownership_returns(foo, 4
void f13(int *i, int *j) __attribute__((ownership_holds(foo, 1))) __attribute__((ownership_takes(foo, 2)));
void f14(int i, int j, int *k) __attribute__((ownership_holds(foo, 3))) __attribute__((ownership_takes(foo, 3))); // expected-error {{'ownership_takes' and 'ownership_holds' attributes are not compatible}}
-void f15(int, int)
+void *f15(int, int)
__attribute__((ownership_returns(foo, 1))) // expected-error {{'ownership_returns' attribute index does not match; here it is 1}}
__attribute__((ownership_returns(foo, 2))); // expected-note {{declared with index 2 here}}
void f16(int *i, int *j) __attribute__((ownership_holds(foo, 1))) __attribute__((ownership_holds(foo, 1))); // OK, same index
@@ -28,3 +28,6 @@ void f18() __attribute__((ownership_takes(foo, 1))); // expected-warning {{'own
int f19(void *)
__attribute__((ownership_takes(foo, 1))) // expected-error {{'ownership_takes' attribute class does not match; here it is 'foo'}}
__attribute__((ownership_takes(foo1, 1))); // expected-note {{declared with class 'foo1' here}}
+
+void f20(void) __attribute__((ownership_returns(foo))); // expected-error {{'ownership_returns' attribute only applies to functions that return a pointer}}
+int f21(void) __attribute__((ownership_returns(foo))); // expected-error {{'ownership_returns' attribute only applies to functions that return a pointer}}
diff --git a/clang/test/Sema/attr-ownership.cpp b/clang/test/Sema/attr-ownership.cpp
index 7381285..0626efa 100644
--- a/clang/test/Sema/attr-ownership.cpp
+++ b/clang/test/Sema/attr-ownership.cpp
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 %s -verify -fsyntax-only
class C {
- void f(int, int)
- __attribute__((ownership_returns(foo, 2))) // expected-error {{'ownership_returns' attribute index does not match; here it is 2}}
- __attribute__((ownership_returns(foo, 3))); // expected-note {{declared with index 3 here}}
+ void *f(int, int)
+ __attribute__((ownership_returns(foo, 2))) // expected-error {{'ownership_returns' attribute index does not match; here it is 2}}
+ __attribute__((ownership_returns(foo, 3))); // expected-note {{declared with index 3 here}}
};
diff --git a/clang/test/SemaCUDA/attr-noconvergent.cu b/clang/test/SemaCUDA/attr-noconvergent.cu
new file mode 100644
index 0000000..0c051fd
--- /dev/null
+++ b/clang/test/SemaCUDA/attr-noconvergent.cu
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+__device__ float f0(float) __attribute__((noconvergent));
+__device__ __attribute__((noconvergent)) float f1(float);
+[[clang::noconvergent]] __device__ float f2(float);
+
+__device__ [[clang::noconvergent(1)]] float f3(float);
+// expected-error@-1 {{'noconvergent' attribute takes no arguments}}
+
+__device__ [[clang::noconvergent]] float g0;
+// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}
+
+__device__ __attribute__((convergent)) __attribute__((noconvergent)) float f4(float);
+// expected-error@-1 {{'noconvergent' and 'convergent' attributes are not compatible}}
+// expected-note@-2 {{conflicting attribute is here}}
+
+__device__ [[clang::noconvergent]] float f5(float);
+__device__ [[clang::convergent]] float f5(float);
+// expected-error@-1 {{'convergent' and 'noconvergent' attributes are not compatible}}
+// expected-note@-3 {{conflicting attribute is here}}
+
+__device__ float f5(float x) {
+ [[clang::noconvergent]] float y;
+// expected-warning@-1 {{'noconvergent' attribute only applies to functions and statements}}
+
+ float z;
+
+ [[clang::noconvergent]] z = 1;
+// expected-warning@-1 {{'noconvergent' attribute is ignored because there exists no call expression inside the statement}}
+
+ [[clang::noconvergent]] z = f0(x);
+}
diff --git a/clang/test/SemaCXX/constexpr-subobj-initialization.cpp b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
index cd096a9..f0252df 100644
--- a/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
+++ b/clang/test/SemaCXX/constexpr-subobj-initialization.cpp
@@ -1,11 +1,12 @@
// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -fexperimental-new-constant-interpreter %s
namespace baseclass_uninit {
struct DelBase {
constexpr DelBase() = delete; // expected-note {{'DelBase' has been explicitly marked deleted here}}
};
-struct Foo : DelBase { // expected-note 2{{constructor of base class 'DelBase' is not called}}
+struct Foo : DelBase { // expected-note-re 2{{constructor of base class '{{.*}}DelBase' is not called}}
constexpr Foo() {}; // expected-error {{call to deleted constructor of 'DelBase'}}
};
constexpr Foo f; // expected-error {{must be initialized by a constant expression}}
@@ -15,13 +16,13 @@ struct Bar : Foo {
constexpr Bar bar; // expected-error {{must be initialized by a constant expression}}
struct Base {};
-struct A : Base { // expected-note {{constructor of base class 'Base' is not called}}
+struct A : Base { // expected-note-re {{constructor of base class '{{.*}}Base' is not called}}
constexpr A() : value() {} // expected-error {{member initializer 'value' does not name a non-static data member or base class}}
};
constexpr A a; // expected-error {{must be initialized by a constant expression}}
-struct B : Base { // expected-note {{constructor of base class 'Base' is not called}}
+struct B : Base { // expected-note-re {{constructor of base class '{{.*}}Base' is not called}}
constexpr B() : {} // expected-error {{expected class member or base class name}}
};
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
index 4811b60..45fee65 100644
--- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
@@ -965,3 +965,10 @@ void f();
};
void a::f(this auto) {} // expected-error {{an explicit object parameter cannot appear in a non-member function}}
}
+
+struct R {
+ void f(this auto &&self, int &&r_value_ref) {} // expected-note {{candidate function template not viable: expects an rvalue for 2nd argument}}
+ void g(int &&r_value_ref) {
+ f(r_value_ref); // expected-error {{no matching member function for call to 'f'}}
+ }
+};
diff --git a/clang/test/SemaCXX/destructor.cpp b/clang/test/SemaCXX/destructor.cpp
index 028bc7c..dfcd1b0 100644
--- a/clang/test/SemaCXX/destructor.cpp
+++ b/clang/test/SemaCXX/destructor.cpp
@@ -577,4 +577,13 @@ static_assert(!__is_trivially_constructible(Foo, const Foo &), "");
static_assert(!__is_trivially_constructible(Foo, Foo &&), "");
} // namespace GH89544
+namespace GH97230 {
+struct X {
+ ~X() = defaul; // expected-error {{initializer on function does not look like a pure-specifier}} \
+ // expected-error {{use of undeclared identifier 'defaul'}}
+};
+struct Y : X {} y1{ }; // expected-error {{call to implicitly-deleted default constructor of 'struct Y'}} \
+ // expected-note {{default constructor of 'Y' is implicitly deleted because base class 'X' has no destructor}}
+}
+
#endif // BE_THE_HEADER
diff --git a/clang/test/SemaHLSL/Loops/unroll.hlsl b/clang/test/SemaHLSL/Loops/unroll.hlsl
index 2e2be31..c94dc58 100644
--- a/clang/test/SemaHLSL/Loops/unroll.hlsl
+++ b/clang/test/SemaHLSL/Loops/unroll.hlsl
@@ -1,7 +1,10 @@
// RUN: %clang_cc1 -O0 -finclude-default-header -fsyntax-only -triple dxil-pc-shadermodel6.6-library %s -verify
void unroll_no_vars() {
+ // expected-note@+1 {{declared here}}
int I = 3;
- [unroll(I)] // expected-error {{'unroll' attribute requires an integer constant}}
+ // expected-error@+2 {{expression is not an integral constant expression}}
+ // expected-note@+1 {{read of non-const variable 'I' is not allowed in a constant expression}}
+ [unroll(I)]
while (I--);
}
diff --git a/clang/test/TableGen/attrs-parser-string-switches.td b/clang/test/TableGen/attrs-parser-string-switches.td
new file mode 100644
index 0000000..c15ab10
--- /dev/null
+++ b/clang/test/TableGen/attrs-parser-string-switches.td
@@ -0,0 +1,232 @@
+// RUN: clang-tblgen -gen-clang-attr-parser-string-switches -I%p/../../include %s -o - 2>&1 | FileCheck %s
+
+// Tests that the tablegen can support attributes with the same spellings but
+// different argument types.
+
+include "clang/Basic/Attr.td"
+
+// Test attributeParsedArgsUnevaluated : different ParseArgumentsAsUnevaluated
+def TestUnEvalOne : InheritableAttr {
+ let Spellings = [Clang<"test_uneval">];
+ let Args = [ExprArgument<"Count">];
+ let Subjects = SubjectList<[Function]>;
+ let ParseArgumentsAsUnevaluated = 1;
+ let Documentation = [Undocumented];
+}
+
+def TestUnEvalTwo : InheritableAttr {
+ let Spellings = [Pragma<"", "test_uneval">];
+ let Args = [ExprArgument<"Count">];
+ let Subjects = SubjectList<[Function]>;
+ let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_ARG_CONTEXT_LIST)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: .Case("GNU::test_uneval", true)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: .Case("CXX11::clang::test_uneval", true)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: .Case("C23::clang::test_uneval", true)
+// CHECK-NOT: .Case("Pragma::test_uneval", true)
+// CHECK: #endif // CLANG_ATTR_ARG_CONTEXT_LIST
+
+// Test attributeHasIdentifierArg: Same spelling, one with and one without
+// an IdentifierArg.
+def TestIdentOne : Attr {
+ let Spellings = [Clang<"test_ident">];
+ let Args = [EnumArgument<"Option", "OptionType", /*is_string=*/false,
+ ["optA", "optB"], ["OPTA", "OPTB"]>];
+ let Subjects = SubjectList<[Function]>;
+ let Documentation = [Undocumented];
+}
+
+def TestIdentTwo : StmtAttr {
+ let Spellings = [Pragma<"", "test_ident">];
+ let Args = [UnsignedArgument<"val", /*opt*/1>];
+ let Subjects = SubjectList<[Function]>;
+ let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_IDENTIFIER_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: .Case("GNU::test_ident", true)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: .Case("CXX11::clang::test_ident", true)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: .Case("C23::clang::test_ident", true)
+// CHECK-NOT: .Case("Pragma::test_ident", true)
+// CHECK: #endif // CLANG_ATTR_IDENTIFIER_ARG_LIST
+
+// Test attributeStringLiteralListArg : Same spelling, some with a
+// StringArgument, some without, some in different locations.
+def TestStringOne : DeclOrTypeAttr {
+ let Spellings = [Clang<"test_string">];
+ let Args = [StringArgument<"strarg">];
+ let Subjects = SubjectList<[Function, TypedefName, ParmVar]>;
+ let Documentation = [AcquireHandleDocs];
+}
+
+def TestStringTwo : InheritableAttr {
+ let Spellings = [Pragma<"", "test_string">];
+ let Args = [UnsignedArgument<"unsarg">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+// In a different position
+def TestStringThree : Attr {
+ let Spellings = [Declspec<"test_string">];
+ let Args = [UnsignedArgument<"uarg">, StringArgument<"strarg">];
+ let Subjects = SubjectList<[Function, TypedefName, ParmVar]>;
+ let Documentation = [AcquireHandleDocs];
+}
+
+// CHECK: #if defined(CLANG_ATTR_STRING_LITERAL_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_string"
+// CHECK: .Case("GNU::test_string", 1)
+// CHECK: .Case("CXX11::clang::test_string", 1)
+// CHECK: .Case("C23::clang::test_string", 1)
+// CHECK-NOT: .Case("Pragma::test_string"
+// CHECK: .Case("Declspec::test_string", 2)
+// CHECK-NOT: .Case("Pragma::test_string"
+// CHECK: #endif // CLANG_ATTR_STRING_LITERAL_ARG_LIST
+
+// Test attributeHasVariadicIdentifierArg : One with VariadicIdentifierArgument
+// and one without.
+def TestVariadicIdentOne : InheritableAttr {
+ let Spellings = [Clang<"test_var_ident">];
+ let Args = [VariadicIdentifierArgument<"iargs">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+def TestVariadicIdentTwo : InheritableAttr {
+ let Spellings = [Pragma<"", "test_var_ident">];
+ let Args = [UnsignedArgument<"Hint">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::"test_var_ident", true)
+// CHECK: .Case("GNU::test_var_ident", true)
+// CHECK-NOT: .Case("Pragma::test_var_ident", true)
+// CHECK: .Case("CXX11::clang::test_var_ident", true)
+// CHECK-NOT: .Case("Pragma::test_var_ident", true)
+// CHECK: .Case("C23::clang::test_var_ident", true)
+// CHECK-NOT: .Case("Pragma::test_var_ident", true)
+// CHECK: #endif // CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
+
+// Test attributeTreatsKeywordThisAsIdentifier : Same spelling, one with and
+// one without VariadicParamOrParamIdxArgument.
+def TestVarOrIdxOne : InheritableAttr {
+ let Spellings = [Clang<"test_var_idx">];
+ let Args = [VariadicParamOrParamIdxArgument<"arg">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+def TestVarOrIdxTwo : InheritableAttr {
+ let Spellings = [Pragma<"", "test_var_idx">];
+ let Args = [UnsignedArgument<"Hint">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: .Case("GNU::test_var_idx", true)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: .Case("CXX11::clang::test_var_idx", true)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: .Case("C23::clang::test_var_idx", true)
+// CHECK-NOT: .Case("Pragma::test_var_idx", true)
+// CHECK: #endif // CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST
+
+// Test attributeAcceptsExprPack : One with, one without.
+def TestExprPackOne : InheritableAttr {
+ let Spellings = [Clang<"test_expr_pack">];
+ let Args = [StringArgument<"str">, VariadicExprArgument<"args">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let AcceptsExprPack = 1;
+ let Documentation = [Undocumented];
+}
+
+def TestExprPackTwo : InheritableAttr {
+ let Spellings = [Pragma<"", "test_expr_pack">];
+ let Args = [StringArgument<"str">, VariadicExprArgument<"args">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_ACCEPTS_EXPR_PACK)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: .Case("GNU::test_expr_pack", true)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: .Case("CXX11::clang::test_expr_pack", true)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: .Case("C23::clang::test_expr_pack", true)
+// CHECK-NOT: .Case("Pragma::test_expr_pack", true)
+// CHECK: #endif // CLANG_ATTR_ACCEPTS_EXPR_PACK
+
+
+// Test attributeIsTypeArgAttr : Same spelling, one with TypeArgument and one
+// without.
+def TestTypeOne : InheritableAttr {
+ let Spellings = [Clang<"test_type">];
+ let Args = [TypeArgument<"Hint">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+def TestTypeTwo : InheritableAttr {
+ let Spellings = [Pragma<"", "test_type">];
+ let Args = [UnsignedArgument<"Hint">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_TYPE_ARG_LIST)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: .Case("GNU::test_type", true)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: .Case("CXX11::clang::test_type", true)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: .Case("C23::clang::test_type", true)
+// CHECK-NOT: .Case("Pragma::test_type", true)
+// CHECK: #endif // CLANG_ATTR_TYPE_ARG_LIST
+
+// Test attributeHasStrictIdentifierArgs and
+// attributeHasStrictIdentifierArgAtIndex; one uses StrictEnumParameters, the
+// other does not.
+def TestStrictEnumOne : InheritableAttr {
+ let Spellings = [Clang<"strict_enum">];
+ let StrictEnumParameters = 1;
+ let Args = [EnumArgument<"One", "OneType", /*is_string=*/true,
+ ["a", "b", "c", "d"],
+ ["A", "B", "C", "D"]>,
+ IntArgument<"Other", 1>,
+ EnumArgument<"Two", "TwoType", /*is_string=*/true,
+ ["e", "f", "g", "h"],
+ ["E", "F", "G", "H"]>];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+def TestStrictEnumTwo : InheritableAttr {
+ let Spellings = [Pragma<"", "strict_enum">];
+ let Args = [VariadicExprArgument<"Args">];
+ let Subjects = SubjectList<[Function], ErrorDiag>;
+ let Documentation = [Undocumented];
+}
+
+// CHECK: #if defined(CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: .Case("GNU::strict_enum", 5ull)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: .Case("CXX11::clang::strict_enum", 5ull)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: .Case("C23::clang::strict_enum", 5ull)
+// CHECK-NOT: .Case("Pragma::strict_enum", 5ull)
+// CHECK: #endif // CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST
diff --git a/clang/test/Tooling/clang-check-extra-arg.cpp b/clang/test/Tooling/clang-check-extra-arg.cpp
index df5fb93..488497e 100644
--- a/clang/test/Tooling/clang-check-extra-arg.cpp
+++ b/clang/test/Tooling/clang-check-extra-arg.cpp
@@ -1,4 +1,6 @@
-// RUN: clang-check "%s" -extra-arg=-Wunimplemented-warning -extra-arg-before=-Wunimplemented-warning-before -- -c 2>&1 | FileCheck %s
+/// Check we do not report "argument unused during compilation: '-c'"
+// RUN: clang-check "%s" -extra-arg=-Wunimplemented-warning -extra-arg-before=-Wunimplemented-warning-before -- -c 2>&1 | FileCheck %s --implicit-check-not='argument unused'
+// RUN: clang-check "%s" -extra-arg=-Wunimplemented-warning -extra-arg-before=-Wunimplemented-warning-before -- -S -Xclang -S 2>&1 | FileCheck %s --implicit-check-not='argument unused'
// CHECK: unknown warning option '-Wunimplemented-warning-before'
// CHECK: unknown warning option '-Wunimplemented-warning'
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 4bb021e..24cc4f0 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -297,7 +297,8 @@ Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) {
/// supported by the toolchain.
bool linkerSupportsLTO(const ArgList &Args) {
llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
- return Triple.isNVPTX() || Triple.isAMDGPU();
+ return Triple.isNVPTX() || Triple.isAMDGPU() ||
+ Args.getLastArgValue(OPT_linker_path_EQ).ends_with("ld.lld");
}
/// Returns the hashed value for a constant string.
@@ -524,6 +525,13 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
Args.MakeArgString("-" + OptLevel),
};
+ // Forward all of the `--offload-opt` and similar options to the device.
+ if (linkerSupportsLTO(Args)) {
+ for (auto &Arg : Args.filtered(OPT_offload_opt_eq_minus, OPT_mllvm))
+ CmdArgs.push_back(
+ Args.MakeArgString("-Wl,--plugin-opt=" + StringRef(Arg->getValue())));
+ }
+
if (!Triple.isNVPTX())
CmdArgs.push_back("-Wl,--no-undefined");
@@ -591,6 +599,8 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
std::back_inserter(CmdArgs));
for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ))
+ CmdArgs.append({"-Xlinker", Args.MakeArgString(Arg)});
+ for (StringRef Arg : Args.getAllArgValues(OPT_compiler_arg_EQ))
CmdArgs.push_back(Args.MakeArgString(Arg));
for (StringRef Arg : Args.getAllArgValues(OPT_builtin_bitcode_EQ)) {
@@ -1216,8 +1226,7 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
auto [Triple, Value] = Arg.split('=');
llvm::Triple TT(Triple);
// If this isn't a recognized triple then it's an `arg=value` option.
- if (TT.getArch() <= Triple::ArchType::UnknownArch ||
- TT.getArch() > Triple::ArchType::LastArchType)
+ if (TT.getArch() == Triple::ArchType::UnknownArch)
DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_linker_arg_EQ),
Args.MakeArgString(Arg));
else if (Value.empty())
@@ -1228,6 +1237,22 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
Args.MakeArgString(Value));
}
+ // Forward '-Xoffload-compiler' options to the appropriate backend.
+ for (StringRef Arg : Args.getAllArgValues(OPT_device_compiler_args_EQ)) {
+ auto [Triple, Value] = Arg.split('=');
+ llvm::Triple TT(Triple);
+ // If this isn't a recognized triple then it's an `arg=value` option.
+ if (TT.getArch() == Triple::ArchType::UnknownArch)
+ DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_compiler_arg_EQ),
+ Args.MakeArgString(Arg));
+ else if (Value.empty())
+ DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_compiler_arg_EQ),
+ Args.MakeArgString(Triple));
+ else if (Triple == DAL.getLastArgValue(OPT_triple_EQ))
+ DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_compiler_arg_EQ),
+ Args.MakeArgString(Value));
+ }
+
return DAL;
}
@@ -1756,7 +1781,7 @@ int main(int Argc, char **Argv) {
for (const opt::Arg *Arg : Args.filtered(OPT_mllvm))
NewArgv.push_back(Arg->getValue());
for (const opt::Arg *Arg : Args.filtered(OPT_offload_opt_eq_minus))
- NewArgv.push_back(Args.MakeArgString(StringRef("-") + Arg->getValue()));
+ NewArgv.push_back(Arg->getValue());
SmallVector<PassPlugin, 1> PluginList;
PassPlugins.setCallback([&](const std::string &PluginPath) {
auto Plugin = PassPlugin::Load(PluginPath);
diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
index 9c27e58..a3e8199 100644
--- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -32,6 +32,9 @@ def builtin_bitcode_EQ : Joined<["--"], "builtin-bitcode=">,
def device_linker_args_EQ : Joined<["--"], "device-linker=">,
Flags<[WrapperOnlyOption]>, MetaVarName<"<value> or <triple>=<value>">,
HelpText<"Arguments to pass to the device linker invocation">;
+def device_compiler_args_EQ : Joined<["--"], "device-compiler=">,
+ Flags<[WrapperOnlyOption]>, MetaVarName<"<value> or <triple>=<value>">,
+ HelpText<"Arguments to pass to the device compiler invocation">;
def clang_backend : Flag<["--"], "clang-backend">,
Flags<[WrapperOnlyOption]>,
HelpText<"Run the backend using clang rather than the LTO backend">;
@@ -91,6 +94,9 @@ def whole_program : Flag<["--"], "whole-program">,
def linker_arg_EQ : Joined<["--"], "linker-arg=">,
Flags<[DeviceOnlyOption, HelpHidden]>,
HelpText<"An extra argument to be passed to the linker">;
+def compiler_arg_EQ : Joined<["--"], "compiler-arg=">,
+ Flags<[DeviceOnlyOption, HelpHidden]>,
+ HelpText<"An extra argument to be passed to the compiler">;
// Arguments for the LLVM backend.
def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
@@ -98,7 +104,7 @@ def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
HelpText<"Arguments passed to LLVM, including Clang invocations, for which "
"the '-mllvm' prefix is preserved. Use '-mllvm --help' for a list "
"of options.">;
-def offload_opt_eq_minus : Joined<["--", "-"], "offload-opt=-">, Flags<[HelpHidden, WrapperOnlyOption]>,
+def offload_opt_eq_minus : Joined<["--", "-"], "offload-opt=">, Flags<[HelpHidden, WrapperOnlyOption]>,
HelpText<"Options passed to LLVM, not including the Clang invocation. Use "
"'--offload-opt=--help' for a list of options.">;
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 3885166..7851414 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -302,6 +302,9 @@ Expected<StringRef> runPTXAs(StringRef File, const ArgList &Args) {
findProgram(Args, "ptxas", {CudaPath + "/bin", GivenPath});
if (!PTXAsPath)
return PTXAsPath.takeError();
+ if (!Args.hasArg(OPT_arch))
+ return createStringError(
+ "must pass in an explicit nvptx64 gpu architecture to 'ptxas'");
auto TempFileOrErr = createTempFile(
Args, sys::path::stem(Args.getLastArgValue(OPT_o, "a.out")), "cubin");
@@ -598,10 +601,11 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
Res.Prevailing = !Sym.isUndefined() && ObjSym.File == *BitcodeFile;
// We need LTO to preserve the following global symbols:
- // 1) Symbols used in regular objects.
- // 2) Prevailing symbols that are needed visible to the gpu runtime.
+ // 1) All symbols during a relocatable link.
+ // 2) Symbols used in regular objects.
+ // 3) Prevailing symbols that need to be visible to the GPU runtime.
Res.VisibleToRegularObj =
- ObjSym.UsedInRegularObj ||
+ Args.hasArg(OPT_relocatable) || ObjSym.UsedInRegularObj ||
(Res.Prevailing &&
(Sym.getVisibility() != GlobalValue::HiddenVisibility &&
!Sym.canBeOmittedFromSymbolTable()));
@@ -693,6 +697,10 @@ Error runNVLink(ArrayRef<StringRef> Files, const ArgList &Args) {
if (!NVLinkPath)
return NVLinkPath.takeError();
+ if (!Args.hasArg(OPT_arch))
+ return createStringError(
+ "must pass in an explicit nvptx64 gpu architecture to 'nvlink'");
+
ArgStringList NewLinkerArgs;
for (const opt::Arg *Arg : Args) {
// Do not forward arguments only intended for the linker wrapper.
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index 8c80a51..01bd0f8 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -73,6 +73,10 @@ def plugin_opt : Joined<["--", "-"], "plugin-opt=">, Flags<[WrapperOnlyOption]>,
def save_temps : Flag<["--", "-"], "save-temps">,
Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">;
+def relocatable : Flag<["--", "-"], "relocatable">,
+ Flags<[WrapperOnlyOption]>, HelpText<"Perform a relocatable link (LTO only)">;
+def r : Flag<["-"], "r">, Flags<[WrapperOnlyOption]>, Alias<relocatable>;
+
def whole_archive : Flag<["--", "-"], "whole-archive">,
Flags<[WrapperOnlyOption, HelpHidden]>;
def no_whole_archive : Flag<["--", "-"], "no-whole-archive">,
@@ -83,8 +87,7 @@ def mllvm : Separate<["-"], "mllvm">, Flags<[WrapperOnlyOption]>,
HelpText<"Arguments passed to LLVM, including Clang invocations, for which "
"the '-mllvm' prefix is preserved. Use '-mllvm --help' for a list "
"of options.">;
-def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>,
- Alias<mllvm>;
+def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>, Alias<mllvm>;
def dry_run : Flag<["--", "-"], "dry-run">, Flags<[WrapperOnlyOption]>,
HelpText<"Print generated commands without running.">;
diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt
index 42618e4..a35ff13 100644
--- a/clang/tools/clang-repl/CMakeLists.txt
+++ b/clang/tools/clang-repl/CMakeLists.txt
@@ -69,7 +69,7 @@ export_executable_symbols_for_plugins(clang-repl)
# gold. This flag tells the linker to build a PLT for the full address range.
# Linkers without this flag are assumed to support proper PLTs by default.
set(flag_long_plt "-Wl,--long-plt")
-llvm_check_linker_flag(CXX ${flag_long_plt} HAVE_LINKER_FLAG_LONG_PLT)
+check_linker_flag(CXX ${flag_long_plt} HAVE_LINKER_FLAG_LONG_PLT)
if(HAVE_LINKER_FLAG_LONG_PLT)
target_link_options(clang-repl PRIVATE ${flag_long_plt})
endif()
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index 290bf2a..018605c2 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -107,7 +107,7 @@ endif()
if(CLANG_ORDER_FILE AND
(LLVM_LINKER_IS_APPLE OR LLVM_LINKER_IS_GOLD OR LLVM_LINKER_IS_LLD))
- include(LLVMCheckLinkerFlag)
+ include(CheckLinkerFlag)
if (LLVM_LINKER_IS_APPLE OR (LLVM_LINKER_IS_LLD AND APPLE))
set(LINKER_ORDER_FILE_OPTION "-Wl,-order_file,${CLANG_ORDER_FILE}")
@@ -118,7 +118,7 @@ if(CLANG_ORDER_FILE AND
endif()
# This is a test to ensure the actual order file works with the linker.
- llvm_check_linker_flag(CXX ${LINKER_ORDER_FILE_OPTION} LINKER_ORDER_FILE_WORKS)
+ check_linker_flag(CXX ${LINKER_ORDER_FILE_OPTION} LINKER_ORDER_FILE_WORKS)
# Passing an empty order file disables some linker layout optimizations.
# To work around this and enable workflows for re-linking when the order file
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index d8af5ab..937d7ff 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2526,7 +2526,7 @@ void OMPClauseEnqueue::VisitOMPHintClause(const OMPHintClause *C) {
}
template <typename T> void OMPClauseEnqueue::VisitOMPClauseList(T *Node) {
- for (const auto *I : Node->varlists()) {
+ for (const auto *I : Node->varlist()) {
Visitor->AddStmt(I);
}
}
@@ -2746,7 +2746,7 @@ void OMPClauseEnqueue::VisitOMPUsesAllocatorsClause(
}
void OMPClauseEnqueue::VisitOMPAffinityClause(const OMPAffinityClause *C) {
Visitor->AddStmt(C->getModifier());
- for (const Expr *E : C->varlists())
+ for (const Expr *E : C->varlist())
Visitor->AddStmt(E);
}
void OMPClauseEnqueue::VisitOMPBindClause(const OMPBindClause *C) {}
diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt
index 7b63400..968b46a 100644
--- a/clang/tools/libclang/CMakeLists.txt
+++ b/clang/tools/libclang/CMakeLists.txt
@@ -190,8 +190,8 @@ if(ENABLE_SHARED)
include(CheckLinkerFlag)
# The Solaris 11.4 linker supports a subset of GNU ld version scripts,
# but requires a special option to enable it.
- llvm_check_linker_flag(CXX "-Wl,-z,gnu-version-script-compat"
- LINKER_SUPPORTS_Z_GNU_VERSION_SCRIPT_COMPAT)
+ check_linker_flag(CXX "-Wl,-z,gnu-version-script-compat"
+ LINKER_SUPPORTS_Z_GNU_VERSION_SCRIPT_COMPAT)
# Older Solaris (and illumos) linker does not support GNU ld version scripts
# and does not support GNU version script compat.
if (LINKER_SUPPORTS_Z_GNU_VERSION_SCRIPT_COMPAT)
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 9b12caa..57242ff 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -9376,29 +9376,6 @@ TEST_P(ASTImporterOptionSpecificTestBase, VaListCpp) {
ToVaList->getUnderlyingType(), ToBuiltinVaList->getUnderlyingType()));
}
-TEST_P(ASTImporterOptionSpecificTestBase,
- ImportDefinitionOfEmptyClassWithNoUniqueAddressField) {
- Decl *FromTU = getTuDecl(
- R"(
- struct B {};
- struct A { B b; };
- )",
- Lang_CXX20);
-
- CXXRecordDecl *FromD = FirstDeclMatcher<CXXRecordDecl>().match(
- FromTU, cxxRecordDecl(hasName("A")));
-
- for (auto *FD : FromD->fields())
- FD->addAttr(clang::NoUniqueAddressAttr::Create(FromD->getASTContext(),
- clang::SourceRange()));
- FromD->markEmpty();
-
- CXXRecordDecl *ToD = Import(FromD, Lang_CXX20);
- EXPECT_TRUE(ToD->isEmpty());
- for (auto *FD : ToD->fields())
- EXPECT_EQ(true, FD->hasAttr<NoUniqueAddressAttr>());
-}
-
TEST_P(ASTImporterOptionSpecificTestBase, ImportExistingTypedefToRecord) {
const char *Code =
R"(
@@ -9820,6 +9797,128 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportMultipleAnonymousEnumDecls) {
ASSERT_NE(ToEnumDeclA, ToEnumDeclB);
}
+struct ImportTemplateParmDeclDefaultValue
+ : public ASTImporterOptionSpecificTestBase {
+protected:
+ void checkTemplateParams(RedeclarableTemplateDecl *D) {
+ auto *CanD = cast<RedeclarableTemplateDecl>(D->getCanonicalDecl());
+ auto *CanNonTypeP = cast<NonTypeTemplateParmDecl>(
+ CanD->getTemplateParameters()->getParam(0));
+ auto *CanTypeP =
+ cast<TemplateTypeParmDecl>(CanD->getTemplateParameters()->getParam(1));
+ auto *CanTemplateP = cast<TemplateTemplateParmDecl>(
+ CanD->getTemplateParameters()->getParam(2));
+ EXPECT_FALSE(CanNonTypeP->getDefaultArgStorage().isInherited());
+ EXPECT_FALSE(CanTypeP->getDefaultArgStorage().isInherited());
+ EXPECT_FALSE(CanTemplateP->getDefaultArgStorage().isInherited());
+ for (Decl *Redecl : D->redecls()) {
+ auto *ReD = cast<RedeclarableTemplateDecl>(Redecl);
+ if (ReD != CanD) {
+ auto *NonTypeP = cast<NonTypeTemplateParmDecl>(
+ ReD->getTemplateParameters()->getParam(0));
+ auto *TypeP = cast<TemplateTypeParmDecl>(
+ ReD->getTemplateParameters()->getParam(1));
+ auto *TemplateP = cast<TemplateTemplateParmDecl>(
+ ReD->getTemplateParameters()->getParam(2));
+ EXPECT_TRUE(NonTypeP->getDefaultArgStorage().isInherited());
+ EXPECT_TRUE(TypeP->getDefaultArgStorage().isInherited());
+ EXPECT_TRUE(TemplateP->getDefaultArgStorage().isInherited());
+ EXPECT_EQ(NonTypeP->getDefaultArgStorage().getInheritedFrom(),
+ CanNonTypeP);
+ EXPECT_EQ(TypeP->getDefaultArgStorage().getInheritedFrom(), CanTypeP);
+ EXPECT_EQ(TemplateP->getDefaultArgStorage().getInheritedFrom(),
+ CanTemplateP);
+ }
+ }
+ }
+
+ void testImport(RedeclarableTemplateDecl *FromD) {
+ RedeclarableTemplateDecl *ToD = Import(FromD, Lang_CXX14);
+ checkTemplateParams(ToD);
+ }
+
+ const char *CodeFunction =
+ R"(
+ template <class> struct X;
+
+ template <int A = 2, typename B = int, template<class> class C = X>
+ void f();
+ template <int A, typename B, template<class> class C>
+ void f();
+ template <int A, typename B, template<class> class C>
+ void f() {}
+ )";
+
+ const char *CodeClass =
+ R"(
+ template <class> struct X;
+
+ template <int A = 2, typename B = int, template<class> class C = X>
+ struct S;
+ template <int A, typename B, template<class> class C>
+ struct S;
+ template <int A, typename B, template<class> class C>
+ struct S {};
+ )";
+
+ const char *CodeVar =
+ R"(
+ template <class> struct X;
+
+ template <int A = 2, typename B = int, template<class> class C = X>
+ extern int V;
+ template <int A, typename B, template<class> class C>
+ extern int V;
+ template <int A, typename B, template<class> class C>
+ int V = A;
+ )";
+};
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportFunctionTemplate) {
+ Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14);
+ auto *FromLastD = LastDeclMatcher<FunctionTemplateDecl>().match(
+ FromTU, functionTemplateDecl(hasName("f")));
+ testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingFunctionTemplate) {
+ getToTuDecl(CodeFunction, Lang_CXX14);
+ Decl *FromTU = getTuDecl(CodeFunction, Lang_CXX14);
+ auto *FromLastD = LastDeclMatcher<FunctionTemplateDecl>().match(
+ FromTU, functionTemplateDecl(hasName("f")));
+ testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportClassTemplate) {
+ Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14);
+ auto *FromLastD = LastDeclMatcher<ClassTemplateDecl>().match(
+ FromTU, classTemplateDecl(hasName("S")));
+ testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingClassTemplate) {
+ getToTuDecl(CodeClass, Lang_CXX14);
+ Decl *FromTU = getTuDecl(CodeClass, Lang_CXX14);
+ auto *FromLastD = LastDeclMatcher<ClassTemplateDecl>().match(
+ FromTU, classTemplateDecl(hasName("S")));
+ testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportVarTemplate) {
+ Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14);
+ auto *FromLastD = LastDeclMatcher<VarTemplateDecl>().match(
+ FromTU, varTemplateDecl(hasName("V")));
+ testImport(FromLastD);
+}
+
+TEST_P(ImportTemplateParmDeclDefaultValue, ImportExistingVarTemplate) {
+ getToTuDecl(CodeVar, Lang_CXX14);
+ Decl *FromTU = getTuDecl(CodeVar, Lang_CXX14);
+ auto *FromLastD = LastDeclMatcher<VarTemplateDecl>().match(
+ FromTU, varTemplateDecl(hasName("V")));
+ testImport(FromLastD);
+}
+
INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest,
DefaultTestValuesForRunOptions);
@@ -9903,6 +10002,9 @@ INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportInjectedClassNameType,
INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportMatrixType,
DefaultTestValuesForRunOptions);
+INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportTemplateParmDeclDefaultValue,
+ DefaultTestValuesForRunOptions);
+
// FIXME: Make ImportOpenCLPipe test work.
// INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ImportOpenCLPipe,
// DefaultTestValuesForRunOptions);
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index f261406..611e1f9 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -2552,6 +2552,10 @@ TEST_P(ASTMatchersTest, HasName_MatchesNamespaces) {
recordDecl(hasName("a+b::C"))));
EXPECT_TRUE(notMatches("namespace a { namespace b { class AC; } }",
recordDecl(hasName("C"))));
+ EXPECT_TRUE(matches("namespace a { inline namespace a { class C; } }",
+ recordDecl(hasName("::a::C"))));
+ EXPECT_TRUE(matches("namespace a { inline namespace a { class C; } }",
+ recordDecl(hasName("::a::a::C"))));
}
TEST_P(ASTMatchersTest, HasName_MatchesOuterClasses) {
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index 1a52b82..8717d97 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -9,6 +9,7 @@
#include "TestingSupport.h"
#include "clang/AST/Decl.h"
#include "clang/AST/ExprCXX.h"
+#include "clang/AST/OperationKinds.h"
#include "clang/AST/Type.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/ASTMatchers/ASTMatchers.h"
@@ -79,7 +80,7 @@ protected:
/// Returns the `CFGBlock` containing `S` (and asserts that it exists).
const CFGBlock *blockForStmt(const Stmt &S) {
- const CFGBlock *Block = ACFG->getStmtToBlock().lookup(&S);
+ const CFGBlock *Block = ACFG->blockForStmt(S);
assert(Block != nullptr);
return Block;
}
@@ -370,6 +371,42 @@ TEST_F(DiscardExprStateTest, ConditionalOperator) {
EXPECT_EQ(CallGState.Env.get<PointerValue>(AddrOfI), nullptr);
}
+TEST_F(DiscardExprStateTest, CallWithParenExprTreatedCorrectly) {
+ // This is a regression test.
+ // In the CFG for `target()` below, the expression that evaluates the function
+ // pointer for `expect` and the actual call are separated into different
+ // basic blocks (because of the control flow introduced by the `||`
+ // operator).
+ // The value for the `expect` function pointer was erroneously discarded
+ // from the environment between these two blocks because the code that
+ // determines whether the expression values for a block need to be preserved
+ // did not ignore the `ParenExpr` around `(i == 1)` (which is not represented
+ // in the CFG).
+ std::string Code = R"(
+ bool expect(bool, bool);
+ void target(int i) {
+ expect(false || (i == 1), false);
+ }
+ )";
+ auto BlockStates = llvm::cantFail(runAnalysis<NoopAnalysis>(
+ Code, [](ASTContext &C) { return NoopAnalysis(C); }));
+
+ const auto &FnToPtrDecay = matchNode<ImplicitCastExpr>(
+ implicitCastExpr(hasCastKind(CK_FunctionToPointerDecay)));
+ const auto &CallExpect =
+ matchNode<CallExpr>(callExpr(callee(functionDecl(hasName("expect")))));
+
+ // In the block that evaluates the implicit cast of `expect` to a pointer,
+ // this expression is associated with a value.
+ const auto &FnToPtrDecayState = blockStateForStmt(BlockStates, FnToPtrDecay);
+ EXPECT_NE(FnToPtrDecayState.Env.getValue(FnToPtrDecay), nullptr);
+
+ // In the block that calls `expect()`, the implicit cast of `expect` to a
+ // pointer is still associated with a value.
+ const auto &CallExpectState = blockStateForStmt(BlockStates, CallExpect);
+ EXPECT_NE(CallExpectState.Env.getValue(FnToPtrDecay), nullptr);
+}
+
struct NonConvergingLattice {
int State;
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 1f820d8..f432b95 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -577,12 +577,20 @@ TEST_F(TokenAnnotatorTest, UnderstandsTernaryInTemplate) {
EXPECT_TOKEN(Tokens[7], tok::greater, TT_TemplateCloser);
// IsExpression = true
+
Tokens = annotate("return foo<true ? 1 : 2>();");
ASSERT_EQ(Tokens.size(), 13u) << Tokens;
EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
EXPECT_TOKEN(Tokens[4], tok::question, TT_ConditionalExpr);
EXPECT_TOKEN(Tokens[6], tok::colon, TT_ConditionalExpr);
EXPECT_TOKEN(Tokens[8], tok::greater, TT_TemplateCloser);
+
+ Tokens = annotate("return foo<true ? 1 : 2>{};");
+ ASSERT_EQ(Tokens.size(), 13u) << Tokens;
+ EXPECT_TOKEN(Tokens[2], tok::less, TT_TemplateOpener);
+ EXPECT_TOKEN(Tokens[4], tok::question, TT_ConditionalExpr);
+ EXPECT_TOKEN(Tokens[6], tok::colon, TT_ConditionalExpr);
+ EXPECT_TOKEN(Tokens[8], tok::greater, TT_TemplateCloser);
}
TEST_F(TokenAnnotatorTest, UnderstandsNonTemplateAngleBrackets) {
@@ -596,6 +604,21 @@ TEST_F(TokenAnnotatorTest, UnderstandsNonTemplateAngleBrackets) {
EXPECT_TOKEN(Tokens[1], tok::less, TT_BinaryOperator);
EXPECT_TOKEN(Tokens[7], tok::greater, TT_BinaryOperator);
+ Tokens = annotate("return A < B ? true : A > B;");
+ ASSERT_EQ(Tokens.size(), 12u) << Tokens;
+ EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator);
+ EXPECT_TOKEN(Tokens[8], tok::greater, TT_BinaryOperator);
+
+ Tokens = annotate("return A < B ? true : A > B ? false : false;");
+ ASSERT_EQ(Tokens.size(), 16u) << Tokens;
+ EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator);
+ EXPECT_TOKEN(Tokens[8], tok::greater, TT_BinaryOperator);
+
+ Tokens = annotate("return A < B ^ A > B;");
+ ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+ EXPECT_TOKEN(Tokens[2], tok::less, TT_BinaryOperator);
+ EXPECT_TOKEN(Tokens[6], tok::greater, TT_BinaryOperator);
+
Tokens = annotate("ratio{-1, 2} < ratio{-1, 3} == -1 / 3 > -1 / 2;");
ASSERT_EQ(Tokens.size(), 27u) << Tokens;
EXPECT_TOKEN(Tokens[7], tok::less, TT_BinaryOperator);
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 1f83440..f504b1d 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -2372,13 +2372,10 @@ void PragmaClangAttributeSupport::generateParsingHelpers(raw_ostream &OS) {
OS << "}\n\n";
}
-template <typename Fn>
-static void forEachUniqueSpelling(const Record &Attr, Fn &&F) {
+template <typename Fn> static void forEachSpelling(const Record &Attr, Fn &&F) {
std::vector<FlattenedSpelling> Spellings = GetFlattenedSpellings(Attr);
- SmallDenseSet<StringRef, 8> Seen;
for (const FlattenedSpelling &S : Spellings) {
- if (Seen.insert(S.name()).second)
- F(S);
+ F(S);
}
}
@@ -2402,8 +2399,11 @@ static void emitClangAttrTypeArgList(RecordKeeper &Records, raw_ostream &OS) {
continue;
// All these spellings take a single type argument.
- forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", " << "true" << ")\n";
+ forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", true)\n";
});
}
OS << "#endif // CLANG_ATTR_TYPE_ARG_LIST\n\n";
@@ -2421,8 +2421,11 @@ static void emitClangAttrArgContextList(RecordKeeper &Records, raw_ostream &OS)
continue;
// All these spellings are parsed unevaluated.
- forEachUniqueSpelling(Attr, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", " << "true" << ")\n";
+ forEachSpelling(Attr, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", true)\n";
});
}
OS << "#endif // CLANG_ATTR_ARG_CONTEXT_LIST\n\n";
@@ -2483,10 +2486,11 @@ static void emitClangAttrVariadicIdentifierArgList(RecordKeeper &Records,
continue;
// All these spellings take an identifier argument.
- forEachUniqueSpelling(*A, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", "
- << "true"
- << ")\n";
+ forEachSpelling(*A, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", true)\n";
});
}
OS << "#endif // CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST\n\n";
@@ -2552,8 +2556,11 @@ static void emitClangAttrUnevaluatedStringLiteralList(RecordKeeper &Records,
continue;
// All these spellings have at least one string literal as argument.
- forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", " << MaskStr << ")\n";
+ forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", " << MaskStr << ")\n";
});
}
OS << "#endif // CLANG_ATTR_STRING_LITERAL_ARG_LIST\n\n";
@@ -2571,8 +2578,11 @@ static void emitClangAttrIdentifierArgList(RecordKeeper &Records, raw_ostream &O
continue;
// All these spellings take an identifier argument.
- forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", " << "true" << ")\n";
+ forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", true)\n";
});
}
OS << "#endif // CLANG_ATTR_IDENTIFIER_ARG_LIST\n\n";
@@ -2587,18 +2597,20 @@ static void emitClangAttrStrictIdentifierArgAtIndexList(RecordKeeper &Records,
for (const auto *Attr : Attrs) {
if (!Attr->getValueAsBit("StrictEnumParameters"))
continue;
- // Determine whether the first argument is an identifier.
+ // Determine whether each argument is an identifier.
std::vector<Record *> Args = Attr->getValueAsListOfDefs("Args");
uint64_t enumAtIndex = 0;
- for (size_t i = 0; i < Args.size(); i++) {
- enumAtIndex |= ((uint64_t)isIdentifierArgument(Args[0])) << i;
- }
+ for (size_t I = 0; I < Args.size(); I++)
+ enumAtIndex |= ((uint64_t)isIdentifierArgument(Args[I])) << I;
if (!enumAtIndex)
continue;
// All these spellings take an identifier argument.
- forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", " << enumAtIndex << "ull)\n";
+ forEachSpelling(*Attr, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", " << enumAtIndex << "ull)\n";
});
}
OS << "#endif // CLANG_ATTR_STRICT_IDENTIFIER_ARG_AT_INDEX_LIST\n\n";
@@ -2623,10 +2635,11 @@ static void emitClangAttrThisIsaIdentifierArgList(RecordKeeper &Records,
continue;
// All these spellings take an identifier argument.
- forEachUniqueSpelling(*A, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", "
- << "true"
- << ")\n";
+ forEachSpelling(*A, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", true)\n";
});
}
OS << "#endif // CLANG_ATTR_THIS_ISA_IDENTIFIER_ARG_LIST\n\n";
@@ -2642,8 +2655,11 @@ static void emitClangAttrAcceptsExprPack(RecordKeeper &Records,
if (!Attr.getValueAsBit("AcceptsExprPack"))
continue;
- forEachUniqueSpelling(Attr, [&](const FlattenedSpelling &S) {
- OS << ".Case(\"" << S.name() << "\", true)\n";
+ forEachSpelling(Attr, [&](const FlattenedSpelling &S) {
+ OS << ".Case(\"" << S.variety();
+ if (S.nameSpace().length())
+ OS << "::" << S.nameSpace();
+ OS << "::" << S.name() << "\", true)\n";
});
}
OS << "#endif // CLANG_ATTR_ACCEPTS_EXPR_PACK\n\n";
diff --git a/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake b/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake
index f61ec05..2524aaf 100644
--- a/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake
+++ b/cmake/Modules/LLVMCheckCompilerLinkerFlag.cmake
@@ -1,14 +1,8 @@
include(CMakePushCheckState)
-
-include(CheckCompilerFlag OPTIONAL)
-
-if(NOT COMMAND check_compiler_flag)
- include(CheckCCompilerFlag)
- include(CheckCXXCompilerFlag)
-endif()
+include(CheckCompilerFlag)
function(llvm_check_compiler_linker_flag lang flag out_var)
- # If testing a flag with check_c_compiler_flag, it gets added to the compile
+ # If testing a flag with check_compiler_flag, it gets added to the compile
# command only, but not to the linker command in that test. If the flag
# is vital for linking to succeed, the test would fail even if it would
# have succeeded if it was included on both commands.
@@ -18,18 +12,6 @@ function(llvm_check_compiler_linker_flag lang flag out_var)
cmake_push_check_state()
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${flag}")
- if(COMMAND check_compiler_flag)
- check_compiler_flag("${lang}" "" ${out_var})
- else()
- # Until the minimum CMAKE version is 3.19
- # cmake builtin compatible, except we assume lang is C or CXX
- if("${lang}" STREQUAL "C")
- check_c_compiler_flag("" ${out_var})
- elseif("${lang}" STREQUAL "CXX")
- check_cxx_compiler_flag("" ${out_var})
- else()
- message(FATAL_ERROR "\"${lang}\" is not C or CXX")
- endif()
- endif()
+ check_compiler_flag("${lang}" "" ${out_var})
cmake_pop_check_state()
endfunction()
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 96e432e..2207555 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -797,6 +797,10 @@ if(ANDROID)
append_list_if(COMPILER_RT_HAS_FUSE_LD_LLD_FLAG -fuse-ld=lld SANITIZER_COMMON_LINK_FLAGS)
append_list_if(COMPILER_RT_HAS_LLD -fuse-ld=lld COMPILER_RT_UNITTEST_LINK_FLAGS)
endif()
+if(${COMPILER_RT_DEFAULT_TARGET_ARCH} MATCHES sparc)
+ # lld has several bugs/limitations on SPARC, so disable (Issue #100320).
+ set(COMPILER_RT_HAS_LLD FALSE)
+endif()
pythonize_bool(COMPILER_RT_HAS_LLD)
pythonize_bool(COMPILER_RT_TEST_USE_LLD)
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index dad557a..9720417 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -1,6 +1,5 @@
include(CMakePushCheckState)
include(AddLLVM)
-include(LLVMCheckCompilerLinkerFlag)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
include(CheckIncludeFiles)
@@ -15,7 +14,7 @@ include(TestBigEndian)
# in tree version of runtimes, we'd be linking against the just-built
# libunwind (and the compiler implicit -lunwind wouldn't succeed as the newly
# built libunwind isn't installed yet). For those cases, it'd be good to
-# link with --uwnindlib=none. Check if that option works.
+# link with --unwindlib=none. Check if that option works.
llvm_check_compiler_linker_flag(C "--unwindlib=none" CXX_SUPPORTS_UNWINDLIB_NONE_FLAG)
check_library_exists(c fopen "" COMPILER_RT_HAS_LIBC)
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index 847e53c..b9df326 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -738,6 +738,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version
#define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
#define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
+#define INSTR_PROF_PROFILE_BITMAP_BIAS_VAR __llvm_profile_bitmap_bias
#define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
index 2006101..4b9ee8c 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c
@@ -10,10 +10,7 @@ struct FEATURES {
extern struct FEATURES __aarch64_cpu_features;
-#if __GNUC__ >= 9
-#pragma GCC diagnostic ignored "-Wprio-ctor-dtor"
-#endif
-__attribute__((constructor(90))) static void get_aarch64_cpu_features(void) {
+CONSTRUCTOR_ATTRIBUTE static void get_aarch64_cpu_features(void) {
if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED))
return;
diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
index 145954e..92931fa 100644
--- a/compiler-rt/lib/builtins/cpu_model/riscv.c
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -108,6 +108,9 @@ struct {
#if defined(__linux__)
+// The RISC-V hwprobe interface is documented here:
+// <https://docs.kernel.org/arch/riscv/hwprobe.html>.
+
static long syscall_impl_5_args(long number, long arg1, long arg2, long arg3,
long arg4, long arg5) {
register long a7 __asm__("a7") = number;
diff --git a/compiler-rt/lib/builtins/os_version_check.c b/compiler-rt/lib/builtins/os_version_check.c
index 01fae83..b10f23a 100644
--- a/compiler-rt/lib/builtins/os_version_check.c
+++ b/compiler-rt/lib/builtins/os_version_check.c
@@ -14,6 +14,7 @@
#ifdef __APPLE__
#include <TargetConditionals.h>
+#include <assert.h>
#include <dispatch/dispatch.h>
#include <dlfcn.h>
#include <stdint.h>
@@ -270,6 +271,8 @@ static inline uint32_t ConstructVersion(uint32_t Major, uint32_t Minor,
return ((Major & 0xffff) << 16) | ((Minor & 0xff) << 8) | (Subminor & 0xff);
}
+#define PLATFORM_MACOS 1
+
int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major,
uint32_t Minor, uint32_t Subminor) {
dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck);
@@ -282,6 +285,29 @@ int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major,
return AvailabilityVersionCheck(1, Versions);
}
+#if TARGET_OS_OSX
+
+int32_t __isPlatformOrVariantPlatformVersionAtLeast(
+ uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor,
+ uint32_t Platform2, uint32_t Major2, uint32_t Minor2, uint32_t Subminor2) {
+ dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck);
+
+ if (!AvailabilityVersionCheck) {
+ // Handle case of back-deployment for older macOS.
+ if (Platform == PLATFORM_MACOS) {
+ return __isOSVersionAtLeast(Major, Minor, Subminor);
+ }
+ assert(Platform2 == PLATFORM_MACOS && "unexpected platform");
+ return __isOSVersionAtLeast(Major2, Minor2, Subminor2);
+ }
+ dyld_build_version_t Versions[] = {
+ {Platform, ConstructVersion(Major, Minor, Subminor)},
+ {Platform2, ConstructVersion(Major2, Minor2, Subminor2)}};
+ return AvailabilityVersionCheck(2, Versions);
+}
+
+#endif
+
#elif __ANDROID__
#include <pthread.h>
diff --git a/compiler-rt/lib/gwp_asan/definitions.h b/compiler-rt/lib/gwp_asan/definitions.h
index bec0290..c6785d4 100644
--- a/compiler-rt/lib/gwp_asan/definitions.h
+++ b/compiler-rt/lib/gwp_asan/definitions.h
@@ -12,7 +12,8 @@
#define GWP_ASAN_TLS_INITIAL_EXEC \
__thread __attribute__((tls_model("initial-exec")))
-#define GWP_ASAN_UNLIKELY(X) __builtin_expect(!!(X), 0)
+#define GWP_ASAN_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define GWP_ASAN_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
#define GWP_ASAN_ALWAYS_INLINE inline __attribute__((always_inline))
#define GWP_ASAN_WEAK __attribute__((weak))
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp
index 3f39402..5d5c729 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_fuchsia.cpp
@@ -24,13 +24,13 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const {
assert((Size % State.PageSize) == 0);
zx_handle_t Vmo;
zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo);
- check(Status == ZX_OK, "Failed to create Vmo");
+ checkWithErrorCode(Status == ZX_OK, "Failed to create Vmo", Status);
_zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name));
zx_vaddr_t Addr;
Status = _zx_vmar_map(_zx_vmar_root_self(),
ZX_VM_PERM_READ | ZX_VM_PERM_WRITE | ZX_VM_ALLOW_FAULTS,
0, Vmo, 0, Size, &Addr);
- check(Status == ZX_OK, "Vmo mapping failed");
+ checkWithErrorCode(Status == ZX_OK, "Vmo mapping failed", Status);
_zx_handle_close(Vmo);
return reinterpret_cast<void *>(Addr);
}
@@ -40,7 +40,7 @@ void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const {
assert((Size % State.PageSize) == 0);
zx_status_t Status = _zx_vmar_unmap(_zx_vmar_root_self(),
reinterpret_cast<zx_vaddr_t>(Ptr), Size);
- check(Status == ZX_OK, "Vmo unmapping failed");
+ checkWithErrorCode(Status == ZX_OK, "Vmo unmapping failed", Status);
}
void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
@@ -50,7 +50,8 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
_zx_vmar_root_self(),
ZX_VM_CAN_MAP_READ | ZX_VM_CAN_MAP_WRITE | ZX_VM_CAN_MAP_SPECIFIC, 0,
Size, &GuardedPagePoolPlatformData.Vmar, &Addr);
- check(Status == ZX_OK, "Failed to reserve guarded pool allocator memory");
+ checkWithErrorCode(Status == ZX_OK,
+ "Failed to reserve guarded pool allocator memory", Status);
_zx_object_set_property(GuardedPagePoolPlatformData.Vmar, ZX_PROP_NAME,
kGwpAsanGuardPageName, strlen(kGwpAsanGuardPageName));
return reinterpret_cast<void *>(Addr);
@@ -59,8 +60,10 @@ void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
void GuardedPoolAllocator::unreserveGuardedPool() {
const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar;
assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self());
- check(_zx_vmar_destroy(Vmar) == ZX_OK, "Failed to destroy a vmar");
- check(_zx_handle_close(Vmar) == ZX_OK, "Failed to close a vmar");
+ zx_status_t Status = _zx_vmar_destroy(Vmar);
+ checkWithErrorCode(Status == ZX_OK, "Failed to destroy a vmar", Status);
+ Status = _zx_handle_close(Vmar);
+ checkWithErrorCode(Status == ZX_OK, "Failed to close a vmar", Status);
GuardedPagePoolPlatformData.Vmar = ZX_HANDLE_INVALID;
}
@@ -69,7 +72,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const {
assert((Size % State.PageSize) == 0);
zx_handle_t Vmo;
zx_status_t Status = _zx_vmo_create(Size, 0, &Vmo);
- check(Status == ZX_OK, "Failed to create vmo");
+ checkWithErrorCode(Status == ZX_OK, "Failed to create vmo", Status);
_zx_object_set_property(Vmo, ZX_PROP_NAME, kGwpAsanAliveSlotName,
strlen(kGwpAsanAliveSlotName));
const zx_handle_t Vmar = GuardedPagePoolPlatformData.Vmar;
@@ -81,7 +84,7 @@ void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const {
ZX_VM_PERM_READ | ZX_VM_PERM_WRITE |
ZX_VM_ALLOW_FAULTS | ZX_VM_SPECIFIC,
Offset, Vmo, 0, Size, &P);
- check(Status == ZX_OK, "Vmo mapping failed");
+ checkWithErrorCode(Status == ZX_OK, "Vmo mapping failed", Status);
_zx_handle_close(Vmo);
}
@@ -93,7 +96,7 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr,
assert(Vmar != ZX_HANDLE_INVALID && Vmar != _zx_vmar_root_self());
const zx_status_t Status =
_zx_vmar_unmap(Vmar, reinterpret_cast<zx_vaddr_t>(Ptr), Size);
- check(Status == ZX_OK, "Vmar unmapping failed");
+ checkWithErrorCode(Status == ZX_OK, "Vmar unmapping failed", Status);
}
size_t GuardedPoolAllocator::getPlatformPageSize() {
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp
index 549e31a..7b2e199 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/guarded_pool_allocator_posix.cpp
@@ -12,6 +12,7 @@
#include "gwp_asan/utilities.h"
#include <assert.h>
+#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
@@ -46,7 +47,8 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const {
assert((Size % State.PageSize) == 0);
void *Ptr = mmap(nullptr, Size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- check(Ptr != MAP_FAILED, "Failed to map guarded pool allocator memory");
+ checkWithErrorCode(Ptr != MAP_FAILED,
+ "Failed to map guarded pool allocator memory", errno);
MaybeSetMappingName(Ptr, Size, Name);
return Ptr;
}
@@ -54,15 +56,16 @@ void *GuardedPoolAllocator::map(size_t Size, const char *Name) const {
void GuardedPoolAllocator::unmap(void *Ptr, size_t Size) const {
assert((reinterpret_cast<uintptr_t>(Ptr) % State.PageSize) == 0);
assert((Size % State.PageSize) == 0);
- check(munmap(Ptr, Size) == 0,
- "Failed to unmap guarded pool allocator memory.");
+ checkWithErrorCode(munmap(Ptr, Size) == 0,
+ "Failed to unmap guarded pool allocator memory.", errno);
}
void *GuardedPoolAllocator::reserveGuardedPool(size_t Size) {
assert((Size % State.PageSize) == 0);
void *Ptr =
mmap(nullptr, Size, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
- check(Ptr != MAP_FAILED, "Failed to reserve guarded pool allocator memory");
+ checkWithErrorCode(Ptr != MAP_FAILED,
+ "Failed to reserve guarded pool allocator memory", errno);
MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName);
return Ptr;
}
@@ -75,8 +78,9 @@ void GuardedPoolAllocator::unreserveGuardedPool() {
void GuardedPoolAllocator::allocateInGuardedPool(void *Ptr, size_t Size) const {
assert((reinterpret_cast<uintptr_t>(Ptr) % State.PageSize) == 0);
assert((Size % State.PageSize) == 0);
- check(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0,
- "Failed to allocate in guarded pool allocator memory");
+ checkWithErrorCode(mprotect(Ptr, Size, PROT_READ | PROT_WRITE) == 0,
+ "Failed to allocate in guarded pool allocator memory",
+ errno);
MaybeSetMappingName(Ptr, Size, kGwpAsanAliveSlotName);
}
@@ -87,9 +91,10 @@ void GuardedPoolAllocator::deallocateInGuardedPool(void *Ptr,
// mmap() a PROT_NONE page over the address to release it to the system, if
// we used mprotect() here the system would count pages in the quarantine
// against the RSS.
- check(mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1,
- 0) != MAP_FAILED,
- "Failed to deallocate in guarded pool allocator memory");
+ checkWithErrorCode(
+ mmap(Ptr, Size, PROT_NONE, MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE, -1,
+ 0) != MAP_FAILED,
+ "Failed to deallocate in guarded pool allocator memory", errno);
MaybeSetMappingName(Ptr, Size, kGwpAsanGuardPageName);
}
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp b/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp
index bc9d3a4..fecf94b 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/utilities_fuchsia.cpp
@@ -8,12 +8,25 @@
#include "gwp_asan/utilities.h"
+#include <alloca.h>
+#include <stdio.h>
#include <string.h>
#include <zircon/sanitizer.h>
+#include <zircon/status.h>
namespace gwp_asan {
void die(const char *Message) {
__sanitizer_log_write(Message, strlen(Message));
__builtin_trap();
}
+
+void dieWithErrorCode(const char *Message, int64_t ErrorCode) {
+ const char *error_str =
+ _zx_status_get_string(static_cast<zx_status_t>(ErrorCode));
+ size_t buffer_size = strlen(Message) + 32 + strlen(error_str);
+ char *buffer = static_cast<char *>(alloca(buffer_size));
+ snprintf(buffer, buffer_size, "%s (Error Code: %s)", Message, error_str);
+ __sanitizer_log_write(buffer, strlen(buffer));
+ __builtin_trap();
+}
} // namespace gwp_asan
diff --git a/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp b/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp
index 73579630..7501980 100644
--- a/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp
+++ b/compiler-rt/lib/gwp_asan/platform_specific/utilities_posix.cpp
@@ -6,7 +6,11 @@
//
//===----------------------------------------------------------------------===//
+#include <alloca.h>
#include <features.h> // IWYU pragma: keep (for __BIONIC__ macro)
+#include <inttypes.h>
+#include <stdint.h>
+#include <string.h>
#ifdef __BIONIC__
#include "gwp_asan/definitions.h"
@@ -27,4 +31,21 @@ void die(const char *Message) {
__builtin_trap();
#endif // __BIONIC__
}
+
+void dieWithErrorCode(const char *Message, int64_t ErrorCode) {
+#ifdef __BIONIC__
+ if (&android_set_abort_message == nullptr)
+ abort();
+
+ size_t buffer_size = strlen(Message) + 48;
+ char *buffer = static_cast<char *>(alloca(buffer_size));
+ snprintf(buffer, buffer_size, "%s (Error Code: %" PRId64 ")", Message,
+ ErrorCode);
+ android_set_abort_message(buffer);
+ abort();
+#else // __BIONIC__
+ fprintf(stderr, "%s (Error Code: %" PRId64 ")", Message, ErrorCode);
+ __builtin_trap();
+#endif // __BIONIC__
+}
} // namespace gwp_asan
diff --git a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
index ca43ec2..5de1af1 100644
--- a/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/gwp_asan/tests/CMakeLists.txt
@@ -28,7 +28,9 @@ set(GWP_ASAN_UNITTESTS
late_init.cpp
options.cpp
recoverable.cpp
- never_allocated.cpp)
+ never_allocated.cpp
+ utilities.cpp
+)
set(GWP_ASAN_UNIT_TEST_HEADERS
${GWP_ASAN_HEADERS}
diff --git a/compiler-rt/lib/gwp_asan/tests/utilities.cpp b/compiler-rt/lib/gwp_asan/tests/utilities.cpp
new file mode 100644
index 0000000..09a54e5
--- /dev/null
+++ b/compiler-rt/lib/gwp_asan/tests/utilities.cpp
@@ -0,0 +1,24 @@
+//===-- utilities.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gwp_asan/utilities.h"
+#include "gwp_asan/tests/harness.h"
+
+using gwp_asan::check;
+using gwp_asan::checkWithErrorCode;
+
+TEST(UtilitiesDeathTest, CheckPrintsAsExpected) {
+ EXPECT_DEATH({ check(false, "Hello world"); }, "Hello world");
+ check(true, "Should not crash");
+ EXPECT_DEATH(
+ { checkWithErrorCode(false, "Hello world", 1337); },
+ "Hello world \\(Error Code: 1337\\)");
+ EXPECT_DEATH(
+ { checkWithErrorCode(false, "Hello world", -1337); },
+ "Hello world \\(Error Code: -1337\\)");
+}
diff --git a/compiler-rt/lib/gwp_asan/utilities.h b/compiler-rt/lib/gwp_asan/utilities.h
index 76e5df2..02f450a 100644
--- a/compiler-rt/lib/gwp_asan/utilities.h
+++ b/compiler-rt/lib/gwp_asan/utilities.h
@@ -12,17 +12,28 @@
#include "gwp_asan/definitions.h"
#include <stddef.h>
+#include <stdint.h>
namespace gwp_asan {
// Terminates in a platform-specific way with `Message`.
void die(const char *Message);
+void dieWithErrorCode(const char *Message, int64_t ErrorCode);
// Checks that `Condition` is true, otherwise dies with `Message`.
GWP_ASAN_ALWAYS_INLINE void check(bool Condition, const char *Message) {
- if (Condition)
+ if (GWP_ASAN_LIKELY(Condition))
return;
die(Message);
}
+
+// Checks that `Condition` is true, otherwise dies with `Message` (including
+// errno at the end).
+GWP_ASAN_ALWAYS_INLINE void
+checkWithErrorCode(bool Condition, const char *Message, int64_t ErrorCode) {
+ if (GWP_ASAN_LIKELY(Condition))
+ return;
+ dieWithErrorCode(Message, ErrorCode);
+}
} // namespace gwp_asan
#endif // GWP_ASAN_UTILITIES_H_
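
Taken together, the gwp_asan hunks above route fatal checks through checkWithErrorCode() so that reports carry errno on POSIX or the zx_status_t on Fuchsia. A stripped-down, self-contained sketch of that fail-fast pattern follows; the names are stand-ins for illustration, not the GWP-ASan implementation itself:

#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <sys/mman.h>

// Minimal stand-ins for die()/dieWithErrorCode().
[[noreturn]] void Die(const char *Message, std::int64_t ErrorCode) {
  std::fprintf(stderr, "%s (Error Code: %lld)\n", Message,
               static_cast<long long>(ErrorCode));
  std::abort();
}

void CheckWithErrorCode(bool Condition, const char *Message,
                        std::int64_t ErrorCode) {
  if (__builtin_expect(Condition, true)) // the "likely" hint from the patch
    return;
  Die(Message, ErrorCode);
}

int main() {
  void *Ptr = mmap(nullptr, 4096, PROT_READ | PROT_WRITE,
                   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  // On failure the report now includes errno, as at the patched call sites.
  CheckWithErrorCode(Ptr != MAP_FAILED, "Failed to map memory", errno);
  std::puts("mapped one page");
  munmap(Ptr, 4096);
  return 0;
}
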
diff --git a/compiler-rt/lib/interception/interception_linux.h b/compiler-rt/lib/interception/interception_linux.h
index 433a3d9..2e01ff4 100644
--- a/compiler-rt/lib/interception/interception_linux.h
+++ b/compiler-rt/lib/interception/interception_linux.h
@@ -28,12 +28,14 @@ bool InterceptFunction(const char *name, const char *ver, uptr *ptr_to_real,
uptr func, uptr trampoline);
} // namespace __interception
-#define INTERCEPT_FUNCTION_LINUX_OR_FREEBSD(func) \
- ::__interception::InterceptFunction( \
- #func, \
- (::__interception::uptr *)&REAL(func), \
- (::__interception::uptr)&(func), \
- (::__interception::uptr)&TRAMPOLINE(func))
+// Cast func to type of REAL(func) before casting to uptr in case it is an
+// overloaded function, which is the case for some glibc functions when
+// _FORTIFY_SOURCE is used. This disambiguates which overload to use.
+#define INTERCEPT_FUNCTION_LINUX_OR_FREEBSD(func) \
+ ::__interception::InterceptFunction( \
+ #func, (::__interception::uptr *)&REAL(func), \
+ (::__interception::uptr)(decltype(REAL(func)))&(func), \
+ (::__interception::uptr) &TRAMPOLINE(func))
// dlvsym is a GNU extension supported by some other platforms.
#if SANITIZER_GLIBC || SANITIZER_FREEBSD || SANITIZER_NETBSD
@@ -41,7 +43,7 @@ bool InterceptFunction(const char *name, const char *ver, uptr *ptr_to_real,
::__interception::InterceptFunction( \
#func, symver, \
(::__interception::uptr *)&REAL(func), \
- (::__interception::uptr)&(func), \
+ (::__interception::uptr)(decltype(REAL(func)))&(func), \
(::__interception::uptr)&TRAMPOLINE(func))
#else
#define INTERCEPT_FUNCTION_VER_LINUX_OR_FREEBSD(func, symver) \
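
The new interception macros lean on a C++ rule worth spelling out: the address of an overloaded function is only unambiguous once it is converted to a specific function-pointer type, which is what the (decltype(REAL(func))) cast supplies. A minimal standalone sketch of that disambiguation (hypothetical names, not part of this patch):

#include <cstdint>
#include <cstdio>

// Two overloads, as a fortified libc function might present under
// _FORTIFY_SOURCE.
int frob(char *buf) { return std::printf("plain %p\n", (void *)buf); }
int frob(char *buf, unsigned long n) {
  return std::printf("fortified %p %lu\n", (void *)buf, n);
}

// The signature the interceptor declared, playing the role of REAL(func).
using frob_t = int (*)(char *);

int main() {
  // `&frob` alone is ambiguous; converting to frob_t selects one overload,
  // and only then is the pointer reinterpreted as an integer address.
  const std::uintptr_t addr =
      reinterpret_cast<std::uintptr_t>(static_cast<frob_t>(&frob));
  std::printf("resolved overload at 0x%llx\n",
              static_cast<unsigned long long>(addr));
  return 0;
}
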
diff --git a/compiler-rt/lib/memprof/memprof_mapping.h b/compiler-rt/lib/memprof/memprof_mapping.h
index fef8acf..6da385a 100644
--- a/compiler-rt/lib/memprof/memprof_mapping.h
+++ b/compiler-rt/lib/memprof/memprof_mapping.h
@@ -55,7 +55,7 @@ extern uptr kHighMemEnd; // Initialized in __memprof_init.
// computed by summing up all individual 1 byte counters. This can incur an
// accuracy penalty.
-#define HISTOGRAM_GRANULARITY 8U
+#define HISTOGRAM_GRANULARITY 8ULL
#define HISTOGRAM_MAX_COUNTER 255U
diff --git a/compiler-rt/lib/nsan/nsan_interceptors.cpp b/compiler-rt/lib/nsan/nsan_interceptors.cpp
index 544b44f..852524b 100644
--- a/compiler-rt/lib/nsan/nsan_interceptors.cpp
+++ b/compiler-rt/lib/nsan/nsan_interceptors.cpp
@@ -21,10 +21,6 @@
#include <wchar.h>
-#if SANITIZER_LINUX
-extern "C" int mallopt(int param, int value);
-#endif
-
using namespace __sanitizer;
using __nsan::nsan_init_is_running;
using __nsan::nsan_initialized;
@@ -209,12 +205,6 @@ void __nsan::InitializeInterceptors() {
static bool initialized = false;
CHECK(!initialized);
- // Instruct libc malloc to consume less memory.
-#if SANITIZER_LINUX
- mallopt(1, 0); // M_MXFAST
- mallopt(-3, 32 * 1024); // M_MMAP_THRESHOLD
-#endif
-
InitializeMallocInterceptors();
INTERCEPT_FUNCTION(memset);
diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index d424a22..6906d52 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -49,7 +49,6 @@ typedef struct ValueProfNode {
#include "profile/InstrProfData.inc"
} ValueProfNode;
-typedef void *IntPtrT;
typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData {
#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name;
#include "profile/InstrProfData.inc"
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 1c58584..db3918d 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -101,6 +101,8 @@ static const int UseBiasVar = 0;
static const char *FileOpenMode = "a+b";
static void *BiasAddr = NULL;
static void *BiasDefaultAddr = NULL;
+static void *BitmapBiasAddr = NULL;
+static void *BitmapBiasDefaultAddr = NULL;
static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
/* Get the sizes of various profile data sections. Taken from
* __llvm_profile_get_size_for_buffer(). */
@@ -199,11 +201,15 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
#define INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR \
INSTR_PROF_CONCAT(INSTR_PROF_PROFILE_COUNTER_BIAS_VAR, _default)
COMPILER_RT_VISIBILITY intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR = 0;
+#define INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR \
+ INSTR_PROF_CONCAT(INSTR_PROF_PROFILE_BITMAP_BIAS_VAR, _default)
+COMPILER_RT_VISIBILITY intptr_t INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR = 0;
/* This variable is a weak external reference which could be used to detect
* whether or not the compiler defined this symbol. */
#if defined(_MSC_VER)
COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR;
+COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_BITMAP_BIAS_VAR;
#if defined(_M_IX86) || defined(__i386__)
#define WIN_SYM_PREFIX "_"
#else
@@ -213,10 +219,17 @@ COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR;
linker, "/alternatename:" WIN_SYM_PREFIX INSTR_PROF_QUOTE( \
INSTR_PROF_PROFILE_COUNTER_BIAS_VAR) "=" WIN_SYM_PREFIX \
INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR))
+#pragma comment( \
+ linker, "/alternatename:" WIN_SYM_PREFIX INSTR_PROF_QUOTE( \
+ INSTR_PROF_PROFILE_BITMAP_BIAS_VAR) "=" WIN_SYM_PREFIX \
+ INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR))
#else
COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR
__attribute__((weak, alias(INSTR_PROF_QUOTE(
INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR))));
+COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_BITMAP_BIAS_VAR
+ __attribute__((weak, alias(INSTR_PROF_QUOTE(
+ INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR))));
#endif
static const int ContinuousModeSupported = 1;
static const int UseBiasVar = 1;
@@ -227,6 +240,9 @@ static const char *FileOpenMode = "w+b";
* used and runtime provides a weak alias so we can check if it's defined. */
static void *BiasAddr = &INSTR_PROF_PROFILE_COUNTER_BIAS_VAR;
static void *BiasDefaultAddr = &INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR;
+static void *BitmapBiasAddr = &INSTR_PROF_PROFILE_BITMAP_BIAS_VAR;
+static void *BitmapBiasDefaultAddr =
+ &INSTR_PROF_PROFILE_BITMAP_BIAS_DEFAULT_VAR;
static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
/* Get the sizes of various profile data sections. Taken from
* __llvm_profile_get_size_for_buffer(). */
@@ -237,12 +253,18 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
const char *BitmapBegin = __llvm_profile_begin_bitmap();
const char *BitmapEnd = __llvm_profile_end_bitmap();
uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd);
+ uint64_t CountersSize =
+ __llvm_profile_get_counters_size(CountersBegin, CountersEnd);
+ uint64_t NumBitmapBytes =
+ __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd);
/* Get the file size. */
uint64_t FileSize = 0;
if (getProfileFileSizeForMerging(File, &FileSize))
return 1;
int Fileno = fileno(File);
+ uint64_t PaddingBytesAfterCounters =
+ __llvm_profile_get_num_padding_bytes(CountersSize);
uint64_t FileOffsetToCounters =
sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize;
@@ -260,7 +282,17 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
/* Return the memory allocated for counters to OS. */
lprofReleaseMemoryPagesToOS((uintptr_t)CountersBegin, (uintptr_t)CountersEnd);
- /* BIAS MODE not supported yet for Bitmap (MCDC). */
+ /* Also mmap MCDC bitmap bytes. If there aren't any bitmap bytes, mmap()
+ * will fail with EINVAL. */
+ if (NumBitmapBytes == 0)
+ return 0;
+
+ /* Update profbm_bias. */
+ uint64_t FileOffsetToBitmap =
+ FileOffsetToCounters + CountersSize + PaddingBytesAfterCounters;
+ /* Update the profile fields based on the current mapping. */
+ INSTR_PROF_PROFILE_BITMAP_BIAS_VAR =
+ (uintptr_t)Profile - (uintptr_t)BitmapBegin + FileOffsetToBitmap;
/* Return the memory allocated for counters to OS. */
lprofReleaseMemoryPagesToOS((uintptr_t)BitmapBegin, (uintptr_t)BitmapEnd);
@@ -272,6 +304,8 @@ static const int UseBiasVar = 0;
static const char *FileOpenMode = "a+b";
static void *BiasAddr = NULL;
static void *BiasDefaultAddr = NULL;
+static void *BitmapBiasAddr = NULL;
+static void *BitmapBiasDefaultAddr = NULL;
static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) {
return 0;
}
@@ -619,8 +653,10 @@ static void initializeProfileForContinuousMode(void) {
PROF_ERR("%s\n", "continuous mode is unsupported on this platform");
return;
}
- if (UseBiasVar && BiasAddr == BiasDefaultAddr) {
- PROF_ERR("%s\n", "__llvm_profile_counter_bias is undefined");
+ if (UseBiasVar && BiasAddr == BiasDefaultAddr &&
+ BitmapBiasAddr == BitmapBiasDefaultAddr) {
+ PROF_ERR("%s\n", "Neither __llvm_profile_counter_bias nor "
+ "__llvm_profile_bitmap_bias is defined");
return;
}
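
For context on the bias extension above: mirroring the existing counter-bias scheme, each MC/DC bitmap byte is conceptually addressed through its in-image location plus a runtime bias, and the hunk picks the bias so that the sum lands on that byte's slot inside the mmap'd profile file. A small illustrative computation with made-up addresses (the expression matches the one assigned to INSTR_PROF_PROFILE_BITMAP_BIAS_VAR; the concrete numbers are assumptions):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical layout, for illustration only.
  const std::uint64_t Profile = 0x7f0000000000;      // mmap'd profile file base
  const std::uint64_t BitmapBegin = 0x555555600000;  // in-image bitmap start
  const std::uint64_t FileOffsetToBitmap = 0x2400;   // bitmap offset in the file

  // Same expression as in the patch: Profile - BitmapBegin + FileOffsetToBitmap.
  const std::uint64_t Bias = Profile - BitmapBegin + FileOffsetToBitmap;

  // Bitmap byte i is then updated at BitmapBegin + i + Bias, which resolves
  // to Profile + FileOffsetToBitmap + i: that byte's slot in the mapped file.
  const std::uint64_t i = 5;
  std::printf("bitmap byte %u written at 0x%llx\n", (unsigned)i,
              (unsigned long long)(BitmapBegin + i + Bias));
  return 0;
}
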
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
index 4d5423e..b630404 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors.cpp
@@ -21,6 +21,18 @@
#include "rtsan/rtsan_context.h"
#if SANITIZER_APPLE
+
+#if TARGET_OS_MAC
+// On macOS, OSSpinLockLock is deprecated and no longer present in the headers,
+// but the symbol still exists on the system. Forward declare here so we
+// don't get compilation errors.
+#include <stdint.h>
+extern "C" {
+typedef int32_t OSSpinLock;
+void OSSpinLockLock(volatile OSSpinLock *__lock);
+}
+#endif
+
#include <libkern/OSAtomic.h>
#include <os/lock.h>
#endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 2ea61b1..fd5ff1b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -33,11 +33,15 @@
// For mips64, syscall(__NR_stat) fills the buffer in the 'struct kernel_stat'
// format. Struct kernel_stat is defined as 'struct stat' in asm/stat.h. To
// access stat from asm/stat.h, without conflicting with definition in
-// sys/stat.h, we use this trick.
-# if SANITIZER_MIPS64
+// sys/stat.h, we use this trick. sparc64 is similar, using
+// syscall(__NR_stat64) and struct kernel_stat64.
+# if SANITIZER_LINUX && (SANITIZER_MIPS64 || SANITIZER_SPARC64)
# include <asm/unistd.h>
# include <sys/types.h>
# define stat kernel_stat
+# if SANITIZER_SPARC64
+# define stat64 kernel_stat64
+# endif
# if SANITIZER_GO
# undef st_atime
# undef st_mtime
@@ -48,6 +52,7 @@
# endif
# include <asm/stat.h>
# undef stat
+# undef stat64
# endif
# include <dlfcn.h>
@@ -220,7 +225,7 @@ uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
// mmap2 specifies file offset in 4096-byte units.
CHECK(IsAligned(offset, 4096));
return internal_syscall(SYSCALL(mmap2), addr, length, prot, flags, fd,
- offset / 4096);
+ (OFF_T)(offset / 4096));
# endif
}
# endif // !SANITIZER_S390
@@ -285,8 +290,7 @@ uptr internal_ftruncate(fd_t fd, uptr size) {
return res;
}
-# if (!SANITIZER_LINUX_USES_64BIT_SYSCALLS || SANITIZER_SPARC) && \
- SANITIZER_LINUX
+# if !SANITIZER_LINUX_USES_64BIT_SYSCALLS && SANITIZER_LINUX
static void stat64_to_stat(struct stat64 *in, struct stat *out) {
internal_memset(out, 0, sizeof(*out));
out->st_dev = in->st_dev;
@@ -327,7 +331,12 @@ static void statx_to_stat(struct statx *in, struct stat *out) {
}
# endif
-# if SANITIZER_MIPS64
+# if SANITIZER_MIPS64 || SANITIZER_SPARC64
+# if SANITIZER_MIPS64
+typedef struct kernel_stat kstat_t;
+# else
+typedef struct kernel_stat64 kstat_t;
+# endif
// Undefine compatibility macros from <sys/stat.h>
// so that they would not clash with the kernel_stat
// st_[a|m|c]time fields
@@ -345,7 +354,7 @@ static void statx_to_stat(struct statx *in, struct stat *out) {
# undef st_mtime_nsec
# undef st_ctime_nsec
# endif
-static void kernel_stat_to_stat(struct kernel_stat *in, struct stat *out) {
+static void kernel_stat_to_stat(kstat_t *in, struct stat *out) {
internal_memset(out, 0, sizeof(*out));
out->st_dev = in->st_dev;
out->st_ino = in->st_ino;
@@ -391,6 +400,12 @@ uptr internal_stat(const char *path, void *buf) {
!SANITIZER_SPARC
return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf,
0);
+# elif SANITIZER_SPARC64
+ kstat_t buf64;
+ int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
+ (uptr)&buf64, 0);
+ kernel_stat_to_stat(&buf64, (struct stat *)buf);
+ return res;
# else
struct stat64 buf64;
int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
@@ -423,6 +438,12 @@ uptr internal_lstat(const char *path, void *buf) {
!SANITIZER_SPARC
return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path, (uptr)buf,
AT_SYMLINK_NOFOLLOW);
+# elif SANITIZER_SPARC64
+ kstat_t buf64;
+ int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
+ (uptr)&buf64, AT_SYMLINK_NOFOLLOW);
+ kernel_stat_to_stat(&buf64, (struct stat *)buf);
+ return res;
# else
struct stat64 buf64;
int res = internal_syscall(SYSCALL(fstatat64), AT_FDCWD, (uptr)path,
@@ -442,10 +463,16 @@ uptr internal_fstat(fd_t fd, void *buf) {
# if SANITIZER_FREEBSD || SANITIZER_LINUX_USES_64BIT_SYSCALLS
# if SANITIZER_MIPS64
// For mips64, fstat syscall fills buffer in the format of kernel_stat
- struct kernel_stat kbuf;
+ kstat_t kbuf;
int res = internal_syscall(SYSCALL(fstat), fd, &kbuf);
kernel_stat_to_stat(&kbuf, (struct stat *)buf);
return res;
+# elif SANITIZER_LINUX && SANITIZER_SPARC64
+ // For sparc64, fstat64 syscall fills buffer in the format of kernel_stat64
+ kstat_t kbuf;
+ int res = internal_syscall(SYSCALL(fstat64), fd, &kbuf);
+ kernel_stat_to_stat(&kbuf, (struct stat *)buf);
+ return res;
# elif SANITIZER_LINUX && defined(__loongarch__)
struct statx bufx;
int res = internal_syscall(SYSCALL(statx), fd, "", AT_EMPTY_PATH,
@@ -826,10 +853,16 @@ uptr internal_sigaltstack(const void *ss, void *oss) {
return internal_syscall(SYSCALL(sigaltstack), (uptr)ss, (uptr)oss);
}
+extern "C" pid_t __fork(void);
+
int internal_fork() {
# if SANITIZER_LINUX
# if SANITIZER_S390
return internal_syscall(SYSCALL(clone), 0, SIGCHLD);
+# elif SANITIZER_SPARC
+ // The clone syscall interface on SPARC differs massively from the rest,
+ // so fall back to __fork.
+ return __fork();
# else
return internal_syscall(SYSCALL(clone), SIGCHLD, 0);
# endif
@@ -2121,8 +2154,26 @@ bool SignalContext::IsTrueFaultingAddress() const {
UNUSED
static const char *RegNumToRegName(int reg) {
switch (reg) {
-# if SANITIZER_LINUX
+# if SANITIZER_LINUX && SANITIZER_GLIBC || SANITIZER_NETBSD
# if defined(__x86_64__)
+# if SANITIZER_NETBSD
+# define REG_RAX _REG_RAX
+# define REG_RBX _REG_RBX
+# define REG_RCX _REG_RCX
+# define REG_RDX _REG_RDX
+# define REG_RDI _REG_RDI
+# define REG_RSI _REG_RSI
+# define REG_RBP _REG_RBP
+# define REG_RSP _REG_RSP
+# define REG_R8 _REG_R8
+# define REG_R9 _REG_R9
+# define REG_R10 _REG_R10
+# define REG_R11 _REG_R11
+# define REG_R12 _REG_R12
+# define REG_R13 _REG_R13
+# define REG_R14 _REG_R14
+# define REG_R15 _REG_R15
+# endif
case REG_RAX:
return "rax";
case REG_RBX:
@@ -2156,6 +2207,16 @@ static const char *RegNumToRegName(int reg) {
case REG_R15:
return "r15";
# elif defined(__i386__)
+# if SANITIZER_NETBSD
+# define REG_EAX _REG_EAX
+# define REG_EBX _REG_EBX
+# define REG_ECX _REG_ECX
+# define REG_EDX _REG_EDX
+# define REG_EDI _REG_EDI
+# define REG_ESI _REG_ESI
+# define REG_EBP _REG_EBP
+# define REG_ESP _REG_ESP
+# endif
case REG_EAX:
return "eax";
case REG_EBX:
@@ -2240,14 +2301,15 @@ static const char *RegNumToRegName(int reg) {
case 31:
return "sp";
# endif
-# endif // SANITIZER_LINUX
+# endif // SANITIZER_LINUX && SANITIZER_GLIBC
default:
return NULL;
}
return NULL;
}
-# if SANITIZER_LINUX && (defined(__arm__) || defined(__aarch64__))
+# if SANITIZER_LINUX && SANITIZER_GLIBC && \
+ (defined(__arm__) || defined(__aarch64__))
static uptr GetArmRegister(ucontext_t *ctx, int RegNum) {
switch (RegNum) {
# if defined(__arm__)
@@ -2289,22 +2351,39 @@ static uptr GetArmRegister(ucontext_t *ctx, int RegNum) {
}
return 0;
}
-# endif // SANITIZER_LINUX && (defined(__arm__) || defined(__aarch64__))
+# endif // SANITIZER_LINUX && SANITIZER_GLIBC && (defined(__arm__) ||
+ // defined(__aarch64__))
UNUSED
static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
const char *RegName = RegNumToRegName(RegNum);
+# if SANITIZER_LINUX && SANITIZER_GLIBC || SANITIZER_NETBSD
# if defined(__x86_64__)
Printf("%s%s = 0x%016llx ", internal_strlen(RegName) == 2 ? " " : "",
- RegName, ctx->uc_mcontext.gregs[RegNum]);
+ RegName,
+# if SANITIZER_LINUX
+ ctx->uc_mcontext.gregs[RegNum]
+# elif SANITIZER_NETBSD
+ ctx->uc_mcontext.__gregs[RegNum]
+# endif
+ );
# elif defined(__i386__)
- Printf("%s = 0x%08x ", RegName, ctx->uc_mcontext.gregs[RegNum]);
-# elif defined(__arm__)
+ Printf("%s = 0x%08x ", RegName,
+# if SANITIZER_LINUX
+ ctx->uc_mcontext.gregs[RegNum]
+# elif SANITIZER_NETBSD
+ ctx->uc_mcontext.__gregs[RegNum]
+# endif
+ );
+# elif defined(__arm__)
Printf("%s%s = 0x%08zx ", internal_strlen(RegName) == 2 ? " " : "", RegName,
GetArmRegister(ctx, RegNum));
-# elif defined(__aarch64__)
+# elif defined(__aarch64__)
Printf("%s%s = 0x%016zx ", internal_strlen(RegName) == 2 ? " " : "", RegName,
GetArmRegister(ctx, RegNum));
+# else
+ (void)RegName;
+# endif
# else
(void)RegName;
# endif
@@ -2312,7 +2391,7 @@ static void DumpSingleReg(ucontext_t *ctx, int RegNum) {
void SignalContext::DumpAllRegisters(void *context) {
ucontext_t *ucontext = (ucontext_t *)context;
-# if SANITIZER_LINUX
+# if SANITIZER_LINUX && SANITIZER_GLIBC || SANITIZER_NETBSD
# if defined(__x86_64__)
Report("Register values:\n");
DumpSingleReg(ucontext, REG_RAX);
@@ -2351,7 +2430,7 @@ void SignalContext::DumpAllRegisters(void *context) {
DumpSingleReg(ucontext, REG_EBP);
DumpSingleReg(ucontext, REG_ESP);
Printf("\n");
-# elif defined(__arm__)
+# elif defined(__arm__) && !SANITIZER_NETBSD
Report("Register values:\n");
DumpSingleReg(ucontext, REG_R0);
DumpSingleReg(ucontext, REG_R1);
@@ -2373,7 +2452,7 @@ void SignalContext::DumpAllRegisters(void *context) {
DumpSingleReg(ucontext, REG_R14);
DumpSingleReg(ucontext, REG_R15);
Printf("\n");
-# elif defined(__aarch64__)
+# elif defined(__aarch64__) && !SANITIZER_NETBSD
Report("Register values:\n");
for (int i = 0; i <= 31; ++i) {
DumpSingleReg(ucontext, i);
@@ -2386,25 +2465,25 @@ void SignalContext::DumpAllRegisters(void *context) {
# elif SANITIZER_FREEBSD
# if defined(__x86_64__)
Report("Register values:\n");
- Printf("rax = 0x%016llx ", ucontext->uc_mcontext.mc_rax);
- Printf("rbx = 0x%016llx ", ucontext->uc_mcontext.mc_rbx);
- Printf("rcx = 0x%016llx ", ucontext->uc_mcontext.mc_rcx);
- Printf("rdx = 0x%016llx ", ucontext->uc_mcontext.mc_rdx);
+ Printf("rax = 0x%016lx ", ucontext->uc_mcontext.mc_rax);
+ Printf("rbx = 0x%016lx ", ucontext->uc_mcontext.mc_rbx);
+ Printf("rcx = 0x%016lx ", ucontext->uc_mcontext.mc_rcx);
+ Printf("rdx = 0x%016lx ", ucontext->uc_mcontext.mc_rdx);
Printf("\n");
- Printf("rdi = 0x%016llx ", ucontext->uc_mcontext.mc_rdi);
- Printf("rsi = 0x%016llx ", ucontext->uc_mcontext.mc_rsi);
- Printf("rbp = 0x%016llx ", ucontext->uc_mcontext.mc_rbp);
- Printf("rsp = 0x%016llx ", ucontext->uc_mcontext.mc_rsp);
+ Printf("rdi = 0x%016lx ", ucontext->uc_mcontext.mc_rdi);
+ Printf("rsi = 0x%016lx ", ucontext->uc_mcontext.mc_rsi);
+ Printf("rbp = 0x%016lx ", ucontext->uc_mcontext.mc_rbp);
+ Printf("rsp = 0x%016lx ", ucontext->uc_mcontext.mc_rsp);
Printf("\n");
- Printf(" r8 = 0x%016llx ", ucontext->uc_mcontext.mc_r8);
- Printf(" r9 = 0x%016llx ", ucontext->uc_mcontext.mc_r9);
- Printf("r10 = 0x%016llx ", ucontext->uc_mcontext.mc_r10);
- Printf("r11 = 0x%016llx ", ucontext->uc_mcontext.mc_r11);
+ Printf(" r8 = 0x%016lx ", ucontext->uc_mcontext.mc_r8);
+ Printf(" r9 = 0x%016lx ", ucontext->uc_mcontext.mc_r9);
+ Printf("r10 = 0x%016lx ", ucontext->uc_mcontext.mc_r10);
+ Printf("r11 = 0x%016lx ", ucontext->uc_mcontext.mc_r11);
Printf("\n");
- Printf("r12 = 0x%016llx ", ucontext->uc_mcontext.mc_r12);
- Printf("r13 = 0x%016llx ", ucontext->uc_mcontext.mc_r13);
- Printf("r14 = 0x%016llx ", ucontext->uc_mcontext.mc_r14);
- Printf("r15 = 0x%016llx ", ucontext->uc_mcontext.mc_r15);
+ Printf("r12 = 0x%016lx ", ucontext->uc_mcontext.mc_r12);
+ Printf("r13 = 0x%016lx ", ucontext->uc_mcontext.mc_r13);
+ Printf("r14 = 0x%016lx ", ucontext->uc_mcontext.mc_r14);
+ Printf("r15 = 0x%016lx ", ucontext->uc_mcontext.mc_r15);
Printf("\n");
# elif defined(__i386__)
Report("Register values:\n");
@@ -2421,6 +2500,8 @@ void SignalContext::DumpAllRegisters(void *context) {
# else
(void)ucontext;
# endif
+# else
+ (void)ucontext;
# endif
// FIXME: Implement this for other OSes and architectures.
}
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp
index 1a1ccce..601897a 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_allocator_test.cpp
@@ -28,12 +28,13 @@
using namespace __sanitizer;
-#if SANITIZER_SOLARIS && defined(__sparcv9)
+#if defined(__sparcv9)
// FIXME: These tests probably fail because Solaris/sparcv9 uses the full
-// 64-bit address space. Needs more investigation
-#define SKIP_ON_SOLARIS_SPARCV9(x) DISABLED_##x
+// 64-bit address space. Same on Linux/sparc64, so probably a general SPARC
+// issue. Needs more investigation
+# define SKIP_ON_SPARCV9(x) DISABLED_##x
#else
-#define SKIP_ON_SOLARIS_SPARCV9(x) x
+# define SKIP_ON_SPARCV9(x) x
#endif
// On 64-bit systems with small virtual address spaces (e.g. 39-bit) we can't
@@ -781,7 +782,7 @@ TEST(SanitizerCommon, CombinedAllocator64VeryCompact) {
}
#endif
-TEST(SanitizerCommon, SKIP_ON_SOLARIS_SPARCV9(CombinedAllocator32Compact)) {
+TEST(SanitizerCommon, SKIP_ON_SPARCV9(CombinedAllocator32Compact)) {
TestCombinedAllocator<Allocator32Compact>();
}
@@ -1028,7 +1029,7 @@ TEST(SanitizerCommon, SizeClassAllocator64DynamicPremappedIteration) {
#endif
#endif
-TEST(SanitizerCommon, SKIP_ON_SOLARIS_SPARCV9(SizeClassAllocator32Iteration)) {
+TEST(SanitizerCommon, SKIP_ON_SPARCV9(SizeClassAllocator32Iteration)) {
TestSizeClassAllocatorIteration<Allocator32Compact>();
}
diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp
index 8da09f69..8500d3a 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_ioctl_test.cpp
@@ -77,7 +77,8 @@ TEST(SanitizerIoctl, Fixup) {
// Test decoding KVM ioctl numbers.
TEST(SanitizerIoctl, KVM_GET_MP_STATE) {
ioctl_desc desc;
- unsigned int desc_value = SANITIZER_MIPS ? 0x4004ae98U : 0x8004ae98U;
+ unsigned int desc_value =
+ SANITIZER_MIPS || SANITIZER_SPARC ? 0x4004ae98U : 0x8004ae98U;
bool res = ioctl_decode(desc_value, &desc);
EXPECT_TRUE(res);
EXPECT_EQ(ioctl_desc::WRITE, desc.type);
@@ -86,7 +87,8 @@ TEST(SanitizerIoctl, KVM_GET_MP_STATE) {
TEST(SanitizerIoctl, KVM_GET_LAPIC) {
ioctl_desc desc;
- unsigned int desc_value = SANITIZER_MIPS ? 0x4400ae8eU : 0x8400ae8eU;
+ unsigned int desc_value =
+ SANITIZER_MIPS || SANITIZER_SPARC ? 0x4400ae8eU : 0x8400ae8eU;
bool res = ioctl_decode(desc_value, &desc);
EXPECT_TRUE(res);
EXPECT_EQ(ioctl_desc::WRITE, desc.type);
diff --git a/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c b/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c
index da0e366..ebbeba1 100644
--- a/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c
+++ b/compiler-rt/test/builtins/TestCases/Darwin/platform_version_check_test.c
@@ -7,11 +7,22 @@ typedef unsigned int uint32_t;
int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major,
uint32_t Minor, uint32_t Subminor);
+int32_t __isPlatformOrVariantPlatformVersionAtLeast(
+ uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor,
+ uint32_t Platform2, uint32_t Major2, uint32_t Minor2, uint32_t Subminor2);
+
+void exit(int status);
+
#define PLATFORM_MACOS 1
+#define PLATFORM_IOS 2
int32_t check(uint32_t Major, uint32_t Minor, uint32_t Subminor) {
int32_t Result =
__isPlatformVersionAtLeast(PLATFORM_MACOS, Major, Minor, Subminor);
+ int32_t ResultVariant = __isPlatformOrVariantPlatformVersionAtLeast(
+ PLATFORM_MACOS, Major, Minor, Subminor, PLATFORM_IOS, 13, 0, 0);
+ if (Result != ResultVariant)
+ exit(-1);
return Result;
}
diff --git a/compiler-rt/test/builtins/Unit/ppc/test b/compiler-rt/test/builtins/Unit/ppc/test
deleted file mode 100755
index 96e0632..0000000
--- a/compiler-rt/test/builtins/Unit/ppc/test
+++ /dev/null
@@ -1,18 +0,0 @@
-for FILE in $(ls *.c); do
- if gcc -arch ppc -O0 $FILE ../../../Release/ppc/libcompiler_rt.Optimized.a -mlong-double-128
- then
- echo "Testing $FILE"
- if ./a.out
- then
- rm ./a.out
- else
- echo "fail"
-# exit 1
- fi
- else
- echo "$FILE failed to compile"
-# exit 1
- fi
-done
-echo "pass"
-exit
diff --git a/compiler-rt/test/builtins/Unit/test b/compiler-rt/test/builtins/Unit/test
deleted file mode 100755
index e068379..0000000
--- a/compiler-rt/test/builtins/Unit/test
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env bash
-
-ARCHS='<host>'
-REMOTE=0
-if test `uname` = "Darwin"; then
- if test "$1" = "armv6"; then
- ARCHS="armv6"
- LIBS="-lSystem"
- REMOTE=1
- mkdir -p remote
- else
- ARCHS="i386 x86_64 ppc"
- LIBS="-lSystem"
- fi
-else
- LIBS="-lc -lm"
-fi
-
-for ARCH in $ARCHS; do
- CFLAGS="-Os -nodefaultlibs -I../../lib"
- if test "$ARCH" != '<host>'; then
- CFLAGS="-arch $ARCH $CFLAGS"
- fi
- for FILE in $(ls *.c); do
- # Use -nodefaultlibs to avoid using libgcc.a
- # Use -lSystem to link with libSystem.dylb.
- # Note -lSystem is *after* libcompiler_rt.Optimized.a so that linker will
- # prefer our implementation over the ones in libSystem.dylib
- EXTRA=
- if test $FILE = gcc_personality_test.c
- then
- # the gcc_personality_test.c requires a helper C++ program
- EXTRA="-fexceptions gcc_personality_test_helper.cxx -lstdc++ /usr/lib/libgcc_s.1.dylib"
- # the libgcc_s.1.dylib use at the end is a hack until libSystem contains _Unwind_Resume
- fi
- if test "$REMOTE" = "1"
- then
- if gcc $CFLAGS $FILE ../../darwin_fat/Release/libcompiler_rt.a $LIBS $EXTRA -o ./remote/$FILE.exe
- then
- echo "Built $FILE.exe for $ARCH"
- else
- echo "$FILE failed to compile"
- fi
- else
- if gcc $CFLAGS $FILE ../../darwin_fat/Release/libcompiler_rt.a $LIBS $EXTRA
- then
- echo "Testing $FILE for $ARCH"
- if ./a.out
- then
- rm ./a.out
- else
- echo "fail"
- exit 1
- fi
- else
- echo "$FILE failed to compile"
- exit 1
- fi
- fi
- done
-done
-echo "pass"
-exit
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
index e01b826..d1015a4 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_aarch64.cpp
@@ -3,7 +3,7 @@
// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
//
-// REQUIRES: aarch64-target-arch
+// REQUIRES: aarch64-target-arch && glibc
#include <signal.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
index e17dbf1..e747f78 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_arm.cpp
@@ -3,7 +3,7 @@
// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NODUMP
// RUN: not %run %t 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-DUMP
//
-// REQUIRES: arm-target-arch
+// REQUIRES: arm-target-arch && glibc
#include <signal.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp
index 74aea4d..5a62ef8 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_i386.cpp
@@ -3,7 +3,7 @@
// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
//
-// REQUIRES: i386-target-arch
+// REQUIRES: i386-target-arch && glibc
#include <signal.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp
index 3d11ef0..aac3c3f 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/dump_registers_x86_64.cpp
@@ -3,7 +3,7 @@
// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
//
-// REQUIRES: x86_64-target-arch
+// REQUIRES: x86_64-target-arch && glibc
#include <signal.h>
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp
index 208ece3..f1afd85 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_line.cpp
@@ -20,7 +20,8 @@ int main(int argc, char **argv) {
// CHECK1: SUMMARY: [[SAN]]: SEGV {{.*}}signal_line.cpp:[[@LINE-2]]:[[TAB]] in main
if (n == 2)
- *((volatile int *)0x1) = __LINE__;
+ // Allow for strict-alignment targets that require natural alignment.
+ *((volatile int *)0x8) = __LINE__;
// CHECK2: #{{[0-9]+ .*}}main {{.*}}signal_line.cpp:[[@LINE-1]]:[[TAB:[0-9]+]]
// CHECK2: SUMMARY: [[SAN]]: SEGV {{.*}}signal_line.cpp:[[@LINE-2]]:[[TAB]] in main
}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp
index 035a5a8..638be63 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/signal_send.cpp
@@ -62,14 +62,14 @@ void test_sigwait() {
int res;
res = fork_and_signal(s);
fprintf(stderr, "fork_and_signal with SIGUSR1,2: %d\n", res);
- // CHECK: died with sig 10
+ // CHECK: died with sig {{10|30}}
// CHECK: fork_and_signal with SIGUSR1,2: 0
// test sigandset... s should only have SIGUSR2 now
s = sigset_and(s, mkset(1, SIGUSR2));
res = fork_and_signal(s);
fprintf(stderr, "fork_and_signal with SIGUSR2: %d\n", res);
- // CHECK: died with sig 12
+ // CHECK: died with sig {{12|31}}
// CHECK: fork_and_signal with SIGUSR2: 0
}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_i386.cpp b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_i386.cpp
new file mode 100644
index 0000000..74aea4d
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_i386.cpp
@@ -0,0 +1,17 @@
+// Check that the sanitizer prints a register dump when dump_registers=1
+// RUN: %clangxx %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: i386-target-arch
+
+#include <signal.h>
+
+int main() {
+ raise(SIGSEGV);
+ // CHECK-DUMP: Register values
+ // CHECK-DUMP-NEXT: eax = {{0x[0-9a-f]+}} ebx = {{0x[0-9a-f]+}} ecx = {{0x[0-9a-f]+}} edx = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: edi = {{0x[0-9a-f]+}} esi = {{0x[0-9a-f]+}} ebp = {{0x[0-9a-f]+}} esp = {{0x[0-9a-f]+}}
+ // CHECK-NODUMP-NOT: Register values
+ return 0;
+}
diff --git a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_x86_64.cpp b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_x86_64.cpp
new file mode 100644
index 0000000..3d11ef0
--- /dev/null
+++ b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/dump_registers_x86_64.cpp
@@ -0,0 +1,19 @@
+// Check that the sanitizer prints a register dump when dump_registers=1
+// RUN: %clangxx %s -o %t
+// RUN: %env_tool_opts=dump_registers=0 not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-NODUMP --strict-whitespace
+// RUN: not %run %t 2>&1 | FileCheck %s --check-prefixes=CHECK-DUMP --strict-whitespace
+//
+// REQUIRES: x86_64-target-arch
+
+#include <signal.h>
+
+int main() {
+ raise(SIGSEGV);
+ // CHECK-DUMP: Register values
+ // CHECK-DUMP-NEXT: rax = {{0x[0-9a-f]+}} rbx = {{0x[0-9a-f]+}} rcx = {{0x[0-9a-f]+}} rdx = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: rdi = {{0x[0-9a-f]+}} rsi = {{0x[0-9a-f]+}} rbp = {{0x[0-9a-f]+}} rsp = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: r8 = {{0x[0-9a-f]+}} r9 = {{0x[0-9a-f]+}} r10 = {{0x[0-9a-f]+}} r11 = {{0x[0-9a-f]+}}
+ // CHECK-DUMP-NEXT: r12 = {{0x[0-9a-f]+}} r13 = {{0x[0-9a-f]+}} r14 = {{0x[0-9a-f]+}} r15 = {{0x[0-9a-f]+}}
+ // CHECK-NODUMP-NOT: Register values
+ return 0;
+}
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h
index 7346d70..938da08 100644
--- a/flang/include/flang/Common/Fortran-features.h
+++ b/flang/include/flang/Common/Fortran-features.h
@@ -70,7 +70,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
IgnoredIntrinsicFunctionType, PreviousScalarUse,
RedeclaredInaccessibleComponent, ImplicitShared, IndexVarRedefinition,
IncompatibleImplicitInterfaces, BadTypeForTarget,
- VectorSubscriptFinalization)
+ VectorSubscriptFinalization, UndefinedFunctionResult)
using LanguageFeatures = EnumSet<LanguageFeature, LanguageFeature_enumSize>;
using UsageWarnings = EnumSet<UsageWarning, UsageWarning_enumSize>;
@@ -144,6 +144,7 @@ public:
warnUsage_.set(UsageWarning::IncompatibleImplicitInterfaces);
warnUsage_.set(UsageWarning::BadTypeForTarget);
warnUsage_.set(UsageWarning::VectorSubscriptFinalization);
+ warnUsage_.set(UsageWarning::UndefinedFunctionResult);
}
LanguageFeatureControl(const LanguageFeatureControl &) = default;
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 8555073..8c6d3b37 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1243,22 +1243,30 @@ bool CheckForCoindexedObject(parser::ContextualMessages &,
const std::optional<ActualArgument> &, const std::string &procName,
const std::string &argName);
-// Get the number of distinct symbols with CUDA attribute in the expression.
+inline bool IsCUDADeviceSymbol(const Symbol &sym) {
+ if (const auto *details =
+ sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+ if (details->cudaDataAttr() &&
+ *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// Get the number of distinct symbols with CUDA device
+// attribute in the expression.
template <typename A> inline int GetNbOfCUDADeviceSymbols(const A &expr) {
semantics::UnorderedSymbolSet symbols;
for (const Symbol &sym : CollectCudaSymbols(expr)) {
- if (const auto *details =
- sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
- if (details->cudaDataAttr() &&
- *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
- symbols.insert(sym);
- }
+ if (IsCUDADeviceSymbol(sym)) {
+ symbols.insert(sym);
}
}
return symbols.size();
}
-// Check if any of the symbols part of the expression has a CUDA data
+// Check if any of the symbols part of the expression has a CUDA device
// attribute.
template <typename A> inline bool HasCUDADeviceAttrs(const A &expr) {
return GetNbOfCUDADeviceSymbols(expr) > 0;
@@ -1270,26 +1278,15 @@ inline bool HasCUDAImplicitTransfer(const Expr<SomeType> &expr) {
unsigned hostSymbols{0};
unsigned deviceSymbols{0};
for (const Symbol &sym : CollectCudaSymbols(expr)) {
- if (const auto *details =
- sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
- if (details->cudaDataAttr() &&
- *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
- ++deviceSymbols;
- } else {
- if (sym.owner().IsDerivedType()) {
- if (const auto *details =
- sym.owner()
- .GetSymbol()
- ->GetUltimate()
- .detailsIf<semantics::ObjectEntityDetails>()) {
- if (details->cudaDataAttr() &&
- *details->cudaDataAttr() != common::CUDADataAttr::Pinned) {
- ++deviceSymbols;
- }
- }
+ if (IsCUDADeviceSymbol(sym)) {
+ ++deviceSymbols;
+ } else {
+ if (sym.owner().IsDerivedType()) {
+ if (IsCUDADeviceSymbol(sym.owner().GetSymbol()->GetUltimate())) {
+ ++deviceSymbols;
}
- ++hostSymbols;
}
+ ++hostSymbols;
}
}
return hostSymbols > 0 && deviceSymbols > 0;
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 80f077a..78bb82b 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -330,6 +330,8 @@ struct IntrinsicLibrary {
mlir::Value genModulo(mlir::Type, llvm::ArrayRef<mlir::Value>);
void genMoveAlloc(llvm::ArrayRef<fir::ExtendedValue>);
void genMvbits(llvm::ArrayRef<fir::ExtendedValue>);
+ enum class NearestProc { Nearest, NextAfter, NextDown, NextUp };
+ template <NearestProc>
mlir::Value genNearest(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genNint(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genNorm2(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
@@ -422,9 +424,12 @@ struct IntrinsicLibrary {
mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args);
- /// Generate code to raise \p except if \p cond is absent,
+ /// Generate code to raise \p excepts if \p cond is absent,
/// or present and true.
- void genRaiseExcept(int except, mlir::Value cond = {});
+ void genRaiseExcept(int excepts, mlir::Value cond = {});
+
+ /// Generate a quiet NaN of a given floating point type.
+ mlir::Value genQNan(mlir::Type resultType);
/// Define the different FIR generators that can be mapped to intrinsic to
/// generate the related code.
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h
index 29745b8..aa6e33c 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h
@@ -21,10 +21,10 @@ class FirOpBuilder;
namespace fir::runtime {
-/// Generate a runtime call to map an ieee_flag_type exception value to a
-/// libm fenv.h value.
-mlir::Value genMapException(fir::FirOpBuilder &builder, mlir::Location loc,
- mlir::Value except);
+/// Generate a runtime call to map a set of ieee_flag_type exceptions to a
+/// libm fenv.h excepts value.
+mlir::Value genMapExcept(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value excepts);
} // namespace fir::runtime
#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_EXCEPTIONS_H
diff --git a/flang/include/flang/Runtime/exceptions.h b/flang/include/flang/Runtime/exceptions.h
index 8f806ab..1ab22da 100644
--- a/flang/include/flang/Runtime/exceptions.h
+++ b/flang/include/flang/Runtime/exceptions.h
@@ -12,7 +12,6 @@
#define FORTRAN_RUNTIME_EXCEPTIONS_H_
#include "flang/Runtime/entry-names.h"
-#include "flang/Runtime/magic-numbers.h"
#include <cinttypes>
namespace Fortran::runtime {
@@ -21,11 +20,9 @@ class Descriptor;
extern "C" {
-// Map a (single) IEEE_FLAG_TYPE exception value to a libm fenv.h value.
-// This could be extended to handle sets of exceptions, but there is no
-// current use case for that. This mapping is done at runtime to support
-// cross compilation.
-std::int32_t RTNAME(MapException)(std::int32_t except);
+// Map a set of IEEE_FLAG_TYPE exception values to a libm fenv.h excepts value.
+// This mapping is done at runtime to support cross compilation.
+std::uint32_t RTNAME(MapException)(std::uint32_t excepts);
} // extern "C"
} // namespace Fortran::runtime
diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h
index 1cded1f..bab0e9a 100644
--- a/flang/include/flang/Runtime/magic-numbers.h
+++ b/flang/include/flang/Runtime/magic-numbers.h
@@ -100,6 +100,10 @@ The denorm value is a nonstandard extension.
#define _FORTRAN_RUNTIME_IEEE_OVERFLOW 8
#define _FORTRAN_RUNTIME_IEEE_UNDERFLOW 16
#define _FORTRAN_RUNTIME_IEEE_INEXACT 32
+#define _FORTRAN_RUNTIME_IEEE_ALL \
+ _FORTRAN_RUNTIME_IEEE_INVALID | _FORTRAN_RUNTIME_IEEE_DENORM | \
+ _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO | _FORTRAN_RUNTIME_IEEE_OVERFLOW | \
+ _FORTRAN_RUNTIME_IEEE_UNDERFLOW | _FORTRAN_RUNTIME_IEEE_INEXACT
#if 0
ieee_round_type values
diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h
index 3ee71fe..ec8d12b 100644
--- a/flang/include/flang/Semantics/semantics.h
+++ b/flang/include/flang/Semantics/semantics.h
@@ -254,6 +254,9 @@ public:
// behavior.
CommonBlockList GetCommonBlocks() const;
+ void NoteDefinedSymbol(const Symbol &);
+ bool IsSymbolDefined(const Symbol &) const;
+
private:
struct ScopeIndexComparator {
bool operator()(parser::CharBlock, parser::CharBlock) const;
@@ -303,6 +306,7 @@ private:
std::unique_ptr<CommonBlockMap> commonBlockMap_;
ModuleDependences moduleDependences_;
std::map<const Symbol *, SourceName> moduleFileOutputRenamings_;
+ UnorderedSymbolSet isDefined_;
};
class Semantics {
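Note: NoteDefinedSymbol/IsSymbolDefined add context-wide bookkeeping of symbols observed to be defined, backed by the new isDefined_ set. A self-contained sketch of the pattern (illustrative types only, not flang's UnorderedSymbolSet):

#include <unordered_set>

class DefinitionTracker {
public:
  void NoteDefined(const void *symbol) { defined_.insert(symbol); }
  bool IsDefined(const void *symbol) const { return defined_.count(symbol) != 0; }
private:
  std::unordered_set<const void *> defined_;
};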
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index cdbe3e3..cf035073 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -460,15 +460,19 @@ private:
// and specialized for each distinct set of type parameter values.
class DerivedTypeDetails {
public:
- const std::list<SourceName> &paramNames() const { return paramNames_; }
- const SymbolVector &paramDecls() const { return paramDecls_; }
+ const SymbolVector &paramNameOrder() const { return paramNameOrder_; }
+ const SymbolVector &paramDeclOrder() const { return paramDeclOrder_; }
bool sequence() const { return sequence_; }
bool isDECStructure() const { return isDECStructure_; }
std::map<SourceName, SymbolRef> &finals() { return finals_; }
const std::map<SourceName, SymbolRef> &finals() const { return finals_; }
bool isForwardReferenced() const { return isForwardReferenced_; }
- void add_paramName(const SourceName &name) { paramNames_.push_back(name); }
- void add_paramDecl(const Symbol &symbol) { paramDecls_.push_back(symbol); }
+ void add_paramNameOrder(const Symbol &symbol) {
+ paramNameOrder_.push_back(symbol);
+ }
+ void add_paramDeclOrder(const Symbol &symbol) {
+ paramDeclOrder_.push_back(symbol);
+ }
void add_component(const Symbol &);
void set_sequence(bool x = true) { sequence_ = x; }
void set_isDECStructure(bool x = true) { isDECStructure_ = x; }
@@ -491,12 +495,12 @@ public:
const Symbol *GetFinalForRank(int) const;
private:
- // These are (1) the names of the derived type parameters in the order
+ // These are (1) the symbols of the derived type parameters in the order
// in which they appear on the type definition statement(s), and (2) the
// symbols that correspond to those names in the order in which their
// declarations appear in the derived type definition(s).
- std::list<SourceName> paramNames_;
- SymbolVector paramDecls_;
+ SymbolVector paramNameOrder_;
+ SymbolVector paramDeclOrder_;
// These are the names of the derived type's components in component
// order. A parent component, if any, appears first in this list.
std::list<SourceName> componentNames_;
@@ -565,18 +569,19 @@ private:
class TypeParamDetails {
public:
- explicit TypeParamDetails(common::TypeParamAttr attr) : attr_{attr} {}
+ TypeParamDetails() = default;
TypeParamDetails(const TypeParamDetails &) = default;
- common::TypeParamAttr attr() const { return attr_; }
+ std::optional<common::TypeParamAttr> attr() const { return attr_; }
+ TypeParamDetails &set_attr(common::TypeParamAttr);
MaybeIntExpr &init() { return init_; }
const MaybeIntExpr &init() const { return init_; }
void set_init(MaybeIntExpr &&expr) { init_ = std::move(expr); }
const DeclTypeSpec *type() const { return type_; }
- void set_type(const DeclTypeSpec &);
+ TypeParamDetails &set_type(const DeclTypeSpec &);
void ReplaceType(const DeclTypeSpec &);
private:
- common::TypeParamAttr attr_;
+ std::optional<common::TypeParamAttr> attr_;
MaybeIntExpr init_;
const DeclTypeSpec *type_{nullptr};
};
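Note: TypeParamDetails can now be default-constructed with its attribute unset, and its setters return *this so calls can be chained. A minimal stand-alone sketch of that builder style (illustrative class, not the flang one):

#include <optional>

struct ParamDetailsSketch {
  ParamDetailsSketch &set_attr(int attr) { attr_ = attr; return *this; }
  ParamDetailsSketch &set_type(const char *type) { type_ = type; return *this; }
  std::optional<int> attr() const { return attr_; }
private:
  std::optional<int> attr_;
  const char *type_{nullptr};
};

// Usage: auto details = ParamDetailsSketch{}.set_type("integer").set_attr(0);
//        if (!details.attr()) { /* attribute not yet established */ }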
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 0fcba31..ec275f3 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -52,7 +52,6 @@ const Symbol *FindPointerComponent(const DeclTypeSpec &);
const Symbol *FindPointerComponent(const Symbol &);
const Symbol *FindInterface(const Symbol &);
const Symbol *FindSubprogram(const Symbol &);
-const Symbol *FindFunctionResult(const Symbol &);
const Symbol *FindOverriddenBinding(
const Symbol &, bool &isInaccessibleDeferred);
const Symbol *FindGlobal(const Symbol &);
@@ -87,6 +86,7 @@ bool IsIntrinsicConcat(
bool IsGenericDefinedOp(const Symbol &);
bool IsDefinedOperator(SourceName);
std::string MakeOpName(SourceName);
+bool IsCommonBlockContaining(const Symbol &, const Symbol &);
// Returns true if maybeAncestor exists and is a proper ancestor of a
// descendent scope (or symbol owner). Will be false, unlike Scope::Contains(),
@@ -261,7 +261,7 @@ std::optional<parser::MessageFixedText> GetImageControlStmtCoarrayMsg(
SymbolVector OrderParameterDeclarations(const Symbol &);
// Returns the complete list of derived type parameter names in the
// order defined by 7.5.3.2.
-std::list<SourceName> OrderParameterNames(const Symbol &);
+SymbolVector OrderParameterNames(const Symbol &);
// Return an existing or new derived type instance
const DeclTypeSpec &FindOrInstantiateDerivedType(Scope &, DerivedTypeSpec &&,
diff --git a/flang/lib/Evaluate/fold-character.cpp b/flang/lib/Evaluate/fold-character.cpp
index 877bc2e..5bdfa53 100644
--- a/flang/lib/Evaluate/fold-character.cpp
+++ b/flang/lib/Evaluate/fold-character.cpp
@@ -97,7 +97,7 @@ Expr<Type<TypeCategory::Character, KIND>> FoldIntrinsicFunction(
return Expr<T>{Constant<T>{CharacterUtils<KIND>::NEW_LINE()}};
} else if (name == "repeat") { // not elemental
if (auto scalars{GetScalarConstantArguments<T, SubscriptInteger>(
- context, funcRef.arguments())}) {
+ context, funcRef.arguments(), /*hasOptionalArgument=*/false)}) {
auto str{std::get<Scalar<T>>(*scalars)};
auto n{std::get<Scalar<SubscriptInteger>>(*scalars).ToInt64()};
if (n < 0) {
@@ -117,8 +117,8 @@ Expr<Type<TypeCategory::Character, KIND>> FoldIntrinsicFunction(
}
}
} else if (name == "trim") { // not elemental
- if (auto scalar{
- GetScalarConstantArguments<T>(context, funcRef.arguments())}) {
+ if (auto scalar{GetScalarConstantArguments<T>(
+ context, funcRef.arguments(), /*hasOptionalArgument=*/false)}) {
return Expr<T>{Constant<T>{
CharacterUtils<KIND>::TRIM(std::get<Scalar<T>>(*scalar))}};
}
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index d5c3931..9ce0edb 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -54,7 +54,8 @@ static constexpr bool useKahanSummation{false};
// Utilities
template <typename T> class Folder {
public:
- explicit Folder(FoldingContext &c) : context_{c} {}
+ explicit Folder(FoldingContext &c, bool forOptionalArgument = false)
+ : context_{c}, forOptionalArgument_{forOptionalArgument} {}
std::optional<Constant<T>> GetNamedConstant(const Symbol &);
std::optional<Constant<T>> ApplySubscripts(const Constant<T> &array,
const std::vector<Constant<SubscriptInteger>> &subscripts);
@@ -81,6 +82,7 @@ public:
private:
FoldingContext &context_;
+ bool forOptionalArgument_{false};
};
std::optional<Constant<SubscriptInteger>> GetConstantSubscript(
@@ -407,7 +409,14 @@ Constant<T> *Folder<T>::Folding(std::optional<ActualArgument> &arg) {
if (auto *expr{UnwrapExpr<Expr<SomeType>>(arg)}) {
if constexpr (T::category != TypeCategory::Derived) {
if (!UnwrapExpr<Expr<T>>(*expr)) {
- if (auto converted{ConvertToType(T::GetType(), std::move(*expr))}) {
+ if (const Symbol *
+ var{forOptionalArgument_
+ ? UnwrapWholeSymbolOrComponentDataRef(*expr)
+ : nullptr};
+ var && (IsOptional(*var) || IsAllocatableOrObjectPointer(var))) {
+ // can't safely convert item that may not be present
+ } else if (auto converted{
+ ConvertToType(T::GetType(), std::move(*expr))}) {
*expr = Fold(context_, std::move(*converted));
}
}
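Note: the new forOptionalArgument_ mode keeps folding from implicitly converting an actual argument whose base object is OPTIONAL, ALLOCATABLE, or a POINTER, since such an object may be absent (or unallocated/disassociated) at run time and must not be evaluated in that case. A stand-alone restatement of the guard (hypothetical helper, not the code above):

// Fold the conversion only when the argument cannot legally be absent.
bool MaySafelyFoldConversion(bool foldingOptionalDummy, bool baseIsOptional,
                             bool baseIsAllocatableOrPointer) {
  return !(foldingOptionalDummy && (baseIsOptional || baseIsAllocatableOrPointer));
}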
@@ -420,10 +429,10 @@ Constant<T> *Folder<T>::Folding(std::optional<ActualArgument> &arg) {
template <typename... A, std::size_t... I>
std::optional<std::tuple<const Constant<A> *...>> GetConstantArgumentsHelper(
FoldingContext &context, ActualArguments &arguments,
- std::index_sequence<I...>) {
+ bool hasOptionalArgument, std::index_sequence<I...>) {
static_assert(sizeof...(A) > 0);
std::tuple<const Constant<A> *...> args{
- Folder<A>{context}.Folding(arguments.at(I))...};
+ Folder<A>{context, hasOptionalArgument}.Folding(arguments.at(I))...};
if ((... && (std::get<I>(args)))) {
return args;
} else {
@@ -433,15 +442,17 @@ std::optional<std::tuple<const Constant<A> *...>> GetConstantArgumentsHelper(
template <typename... A>
std::optional<std::tuple<const Constant<A> *...>> GetConstantArguments(
- FoldingContext &context, ActualArguments &args) {
+ FoldingContext &context, ActualArguments &args, bool hasOptionalArgument) {
return GetConstantArgumentsHelper<A...>(
- context, args, std::index_sequence_for<A...>{});
+ context, args, hasOptionalArgument, std::index_sequence_for<A...>{});
}
template <typename... A, std::size_t... I>
std::optional<std::tuple<Scalar<A>...>> GetScalarConstantArgumentsHelper(
- FoldingContext &context, ActualArguments &args, std::index_sequence<I...>) {
- if (auto constArgs{GetConstantArguments<A...>(context, args)}) {
+ FoldingContext &context, ActualArguments &args, bool hasOptionalArgument,
+ std::index_sequence<I...>) {
+ if (auto constArgs{
+ GetConstantArguments<A...>(context, args, hasOptionalArgument)}) {
return std::tuple<Scalar<A>...>{
std::get<I>(*constArgs)->GetScalarValue().value()...};
} else {
@@ -451,9 +462,9 @@ std::optional<std::tuple<Scalar<A>...>> GetScalarConstantArgumentsHelper(
template <typename... A>
std::optional<std::tuple<Scalar<A>...>> GetScalarConstantArguments(
- FoldingContext &context, ActualArguments &args) {
+ FoldingContext &context, ActualArguments &args, bool hasOptionalArgument) {
return GetScalarConstantArgumentsHelper<A...>(
- context, args, std::index_sequence_for<A...>{});
+ context, args, hasOptionalArgument, std::index_sequence_for<A...>{});
}
// helpers to fold intrinsic function references
@@ -470,9 +481,10 @@ template <template <typename, typename...> typename WrapperType, typename TR,
typename... TA, std::size_t... I>
Expr<TR> FoldElementalIntrinsicHelper(FoldingContext &context,
FunctionRef<TR> &&funcRef, WrapperType<TR, TA...> func,
- std::index_sequence<I...>) {
+ bool hasOptionalArgument, std::index_sequence<I...>) {
if (std::optional<std::tuple<const Constant<TA> *...>> args{
- GetConstantArguments<TA...>(context, funcRef.arguments())}) {
+ GetConstantArguments<TA...>(
+ context, funcRef.arguments(), hasOptionalArgument)}) {
// Compute the shape of the result based on shapes of arguments
ConstantSubscripts shape;
int rank{0};
@@ -542,15 +554,19 @@ Expr<TR> FoldElementalIntrinsicHelper(FoldingContext &context,
template <typename TR, typename... TA>
Expr<TR> FoldElementalIntrinsic(FoldingContext &context,
- FunctionRef<TR> &&funcRef, ScalarFunc<TR, TA...> func) {
- return FoldElementalIntrinsicHelper<ScalarFunc, TR, TA...>(
- context, std::move(funcRef), func, std::index_sequence_for<TA...>{});
+ FunctionRef<TR> &&funcRef, ScalarFunc<TR, TA...> func,
+ bool hasOptionalArgument = false) {
+ return FoldElementalIntrinsicHelper<ScalarFunc, TR, TA...>(context,
+ std::move(funcRef), func, hasOptionalArgument,
+ std::index_sequence_for<TA...>{});
}
template <typename TR, typename... TA>
Expr<TR> FoldElementalIntrinsic(FoldingContext &context,
- FunctionRef<TR> &&funcRef, ScalarFuncWithContext<TR, TA...> func) {
- return FoldElementalIntrinsicHelper<ScalarFuncWithContext, TR, TA...>(
- context, std::move(funcRef), func, std::index_sequence_for<TA...>{});
+ FunctionRef<TR> &&funcRef, ScalarFuncWithContext<TR, TA...> func,
+ bool hasOptionalArgument = false) {
+ return FoldElementalIntrinsicHelper<ScalarFuncWithContext, TR, TA...>(context,
+ std::move(funcRef), func, hasOptionalArgument,
+ std::index_sequence_for<TA...>{});
}
std::optional<std::int64_t> GetInt64ArgOr(
diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp
index 981cdff..821fa4e 100644
--- a/flang/lib/Evaluate/fold-integer.cpp
+++ b/flang/lib/Evaluate/fold-integer.cpp
@@ -347,7 +347,8 @@ public:
bool back{false};
if (arg_[backArg]) {
const auto *backConst{
- Folder<LogicalResult>{context_}.Folding(arg_[backArg])};
+ Folder<LogicalResult>{context_, /*forOptionalArgument=*/true}.Folding(
+ arg_[backArg])};
if (backConst) {
back = backConst->GetScalarValue().value().IsTrue();
} else {
@@ -910,8 +911,10 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
const auto *argCon{Folder<T>(context).Folding(args[0])};
const auto *shiftCon{Folder<Int4>(context).Folding(args[1])};
const auto *shiftVals{shiftCon ? &shiftCon->values() : nullptr};
- const auto *sizeCon{
- args.size() == 3 ? Folder<Int4>(context).Folding(args[2]) : nullptr};
+ const auto *sizeCon{args.size() == 3
+ ? Folder<Int4>{context, /*forOptionalArgument=*/true}.Folding(
+ args[2])
+ : nullptr};
const auto *sizeVals{sizeCon ? &sizeCon->values() : nullptr};
if ((argCon && argCon->empty()) || !shiftVals || shiftVals->empty() ||
(sizeVals && sizeVals->empty())) {
@@ -985,7 +988,8 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
auto shiftVal{static_cast<int>(shift.ToInt64())};
auto sizeVal{static_cast<int>(size.ToInt64())};
return i.ISHFTC(shiftVal, sizeVal);
- }));
+ }),
+ /*hasOptionalArgument=*/true);
}
} else if (name == "izext" || name == "jzext") {
if (args.size() == 1) {
@@ -1208,20 +1212,23 @@ Expr<Type<TypeCategory::Integer, KIND>> FoldIntrinsicFunction(
cx->u)};
}
} else if (name == "rank") {
- if (const auto *array{UnwrapExpr<Expr<SomeType>>(args[0])}) {
- if (auto named{ExtractNamedEntity(*array)}) {
- const Symbol &symbol{named->GetLastSymbol()};
- if (IsAssumedRank(symbol)) {
- // DescriptorInquiry can only be placed in expression of kind
- // DescriptorInquiry::Result::kind.
- return ConvertToType<T>(Expr<
- Type<TypeCategory::Integer, DescriptorInquiry::Result::kind>>{
- DescriptorInquiry{*named, DescriptorInquiry::Field::Rank}});
- }
+ if (args[0]) {
+ const Symbol *symbol{nullptr};
+ if (auto dataRef{ExtractDataRef(args[0])}) {
+ symbol = &dataRef->GetLastSymbol();
+ } else {
+ symbol = args[0]->GetAssumedTypeDummy();
+ }
+ if (symbol && IsAssumedRank(*symbol)) {
+ // DescriptorInquiry can only be placed in expression of kind
+ // DescriptorInquiry::Result::kind.
+ return ConvertToType<T>(
+ Expr<Type<TypeCategory::Integer, DescriptorInquiry::Result::kind>>{
+ DescriptorInquiry{
+ NamedEntity{*symbol}, DescriptorInquiry::Field::Rank}});
}
- return Expr<T>{args[0].value().Rank()};
+ return Expr<T>{args[0]->Rank()};
}
- return Expr<T>{args[0].value().Rank()};
} else if (name == "selected_char_kind") {
if (const auto *chCon{UnwrapExpr<Constant<TypeOf<std::string>>>(args[0])}) {
if (std::optional<std::string> value{chCon->GetScalarValue()}) {
diff --git a/flang/lib/Evaluate/fold-real.cpp b/flang/lib/Evaluate/fold-real.cpp
index 69c7a92..2dd08a7 100644
--- a/flang/lib/Evaluate/fold-real.cpp
+++ b/flang/lib/Evaluate/fold-real.cpp
@@ -20,8 +20,8 @@ static Expr<T> FoldTransformationalBessel(
/// arguments to Int4, any overflow error will be reported during the
/// conversion folding.
using Int4 = Type<TypeCategory::Integer, 4>;
- if (auto args{
- GetConstantArguments<Int4, Int4, T>(context, funcRef.arguments())}) {
+ if (auto args{GetConstantArguments<Int4, Int4, T>(
+ context, funcRef.arguments(), /*hasOptionalArgument=*/false)}) {
const std::string &name{std::get<SpecificIntrinsic>(funcRef.proc().u).name};
if (auto elementalBessel{GetHostRuntimeWrapper<T, Int4, T>(name)}) {
std::vector<Scalar<T>> results;
diff --git a/flang/lib/Evaluate/real.cpp b/flang/lib/Evaluate/real.cpp
index 223f67f..a5f8070 100644
--- a/flang/lib/Evaluate/real.cpp
+++ b/flang/lib/Evaluate/real.cpp
@@ -330,12 +330,12 @@ ValueWithRealFlags<Real<W, P>> Real<W, P>::SQRT(Rounding rounding) const {
template <typename W, int P>
ValueWithRealFlags<Real<W, P>> Real<W, P>::NEAREST(bool upward) const {
ValueWithRealFlags<Real> result;
+ bool isNegative{IsNegative()};
if (IsFinite()) {
Fraction fraction{GetFraction()};
int expo{Exponent()};
Fraction one{1};
Fraction nearest;
- bool isNegative{IsNegative()};
if (upward != isNegative) { // upward in magnitude
auto next{fraction.AddUnsigned(one)};
if (next.carry) {
@@ -359,6 +359,8 @@ ValueWithRealFlags<Real<W, P>> Real<W, P>::NEAREST(bool upward) const {
}
}
result.flags = result.value.Normalize(isNegative, expo, nearest);
+ } else if (IsInfinite() && upward == isNegative) {
+ result.value = isNegative ? HUGE().Negate() : HUGE(); // largest mag finite
} else {
result.flags.set(RealFlag::InvalidArgument);
result.value = *this;
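Note: the added branch makes NEAREST step an infinity toward zero to the largest-magnitude finite value instead of flagging an invalid argument. The C++ <cmath> analogue behaves the same way (illustration only, not flang code):

#include <cassert>
#include <cfloat>
#include <cmath>
#include <limits>

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  assert(std::nextafter(inf, 0.0) == DBL_MAX);    // +inf stepped downward
  assert(std::nextafter(-inf, 0.0) == -DBL_MAX);  // -inf stepped upward
  return 0;
}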
diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp
index 247386a..b074ae6d 100644
--- a/flang/lib/Evaluate/variable.cpp
+++ b/flang/lib/Evaluate/variable.cpp
@@ -250,7 +250,8 @@ DescriptorInquiry::DescriptorInquiry(NamedEntity &&base, Field field, int dim)
const Symbol &last{base_.GetLastSymbol()};
CHECK(IsDescriptor(last));
CHECK((field == Field::Len && dim == 0) ||
- (field != Field::Len && dim >= 0 && dim < last.Rank()));
+ (field != Field::Len && dim >= 0 &&
+ (dim < last.Rank() || IsAssumedRank(last))));
}
// LEN()
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index b26c167..310c0b0 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -181,19 +181,19 @@ static void addUseDeviceClause(
static void convertLoopBounds(lower::AbstractConverter &converter,
mlir::Location loc,
- mlir::omp::CollapseClauseOps &result,
+ mlir::omp::LoopRelatedOps &result,
std::size_t loopVarTypeSize) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
// The types of lower bound, upper bound, and step are converted into the
// type of the loop variable if necessary.
mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
- for (unsigned it = 0; it < (unsigned)result.loopLBVar.size(); it++) {
- result.loopLBVar[it] =
- firOpBuilder.createConvert(loc, loopVarType, result.loopLBVar[it]);
- result.loopUBVar[it] =
- firOpBuilder.createConvert(loc, loopVarType, result.loopUBVar[it]);
- result.loopStepVar[it] =
- firOpBuilder.createConvert(loc, loopVarType, result.loopStepVar[it]);
+ for (unsigned it = 0; it < (unsigned)result.loopLowerBounds.size(); it++) {
+ result.loopLowerBounds[it] = firOpBuilder.createConvert(
+ loc, loopVarType, result.loopLowerBounds[it]);
+ result.loopUpperBounds[it] = firOpBuilder.createConvert(
+ loc, loopVarType, result.loopUpperBounds[it]);
+ result.loopSteps[it] =
+ firOpBuilder.createConvert(loc, loopVarType, result.loopSteps[it]);
}
}
@@ -203,7 +203,7 @@ static void convertLoopBounds(lower::AbstractConverter &converter,
bool ClauseProcessor::processCollapse(
mlir::Location currentLocation, lower::pft::Evaluation &eval,
- mlir::omp::CollapseClauseOps &result,
+ mlir::omp::LoopRelatedOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const {
bool found = false;
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -232,15 +232,15 @@ bool ClauseProcessor::processCollapse(
std::get_if<parser::LoopControl::Bounds>(&loopControl->u);
assert(bounds && "Expected bounds for worksharing do loop");
lower::StatementContext stmtCtx;
- result.loopLBVar.push_back(fir::getBase(
+ result.loopLowerBounds.push_back(fir::getBase(
converter.genExprValue(*semantics::GetExpr(bounds->lower), stmtCtx)));
- result.loopUBVar.push_back(fir::getBase(
+ result.loopUpperBounds.push_back(fir::getBase(
converter.genExprValue(*semantics::GetExpr(bounds->upper), stmtCtx)));
if (bounds->step) {
- result.loopStepVar.push_back(fir::getBase(
+ result.loopSteps.push_back(fir::getBase(
converter.genExprValue(*semantics::GetExpr(bounds->step), stmtCtx)));
} else { // If `step` is not present, assume it as `1`.
- result.loopStepVar.push_back(firOpBuilder.createIntegerConstant(
+ result.loopSteps.push_back(firOpBuilder.createIntegerConstant(
currentLocation, firOpBuilder.getIntegerType(32), 1));
}
iv.push_back(bounds->name.thing.symbol);
@@ -291,8 +291,7 @@ bool ClauseProcessor::processDevice(lower::StatementContext &stmtCtx,
}
}
const auto &deviceExpr = std::get<omp::SomeExpr>(clause->t);
- result.deviceVar =
- fir::getBase(converter.genExprValue(deviceExpr, stmtCtx));
+ result.device = fir::getBase(converter.genExprValue(deviceExpr, stmtCtx));
return true;
}
return false;
@@ -322,10 +321,10 @@ bool ClauseProcessor::processDistSchedule(
lower::StatementContext &stmtCtx,
mlir::omp::DistScheduleClauseOps &result) const {
if (auto *clause = findUniqueClause<omp::clause::DistSchedule>()) {
- result.distScheduleStaticAttr = converter.getFirOpBuilder().getUnitAttr();
+ result.distScheduleStatic = converter.getFirOpBuilder().getUnitAttr();
const auto &chunkSize = std::get<std::optional<ExprTy>>(clause->t);
if (chunkSize)
- result.distScheduleChunkSizeVar =
+ result.distScheduleChunkSize =
fir::getBase(converter.genExprValue(*chunkSize, stmtCtx));
return true;
}
@@ -335,7 +334,7 @@ bool ClauseProcessor::processDistSchedule(
bool ClauseProcessor::processFilter(lower::StatementContext &stmtCtx,
mlir::omp::FilterClauseOps &result) const {
if (auto *clause = findUniqueClause<omp::clause::Filter>()) {
- result.filteredThreadIdVar =
+ result.filteredThreadId =
fir::getBase(converter.genExprValue(clause->v, stmtCtx));
return true;
}
@@ -351,7 +350,7 @@ bool ClauseProcessor::processFinal(lower::StatementContext &stmtCtx,
mlir::Value finalVal =
fir::getBase(converter.genExprValue(clause->v, stmtCtx));
- result.finalVar = firOpBuilder.createConvert(
+ result.final = firOpBuilder.createConvert(
clauseLocation, firOpBuilder.getI1Type(), finalVal);
return true;
}
@@ -362,7 +361,7 @@ bool ClauseProcessor::processHint(mlir::omp::HintClauseOps &result) const {
if (auto *clause = findUniqueClause<omp::clause::Hint>()) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
int64_t hintValue = *evaluate::ToInt64(clause->v);
- result.hintAttr = firOpBuilder.getI64IntegerAttr(hintValue);
+ result.hint = firOpBuilder.getI64IntegerAttr(hintValue);
return true;
}
return false;
@@ -370,11 +369,11 @@ bool ClauseProcessor::processHint(mlir::omp::HintClauseOps &result) const {
bool ClauseProcessor::processMergeable(
mlir::omp::MergeableClauseOps &result) const {
- return markClauseOccurrence<omp::clause::Mergeable>(result.mergeableAttr);
+ return markClauseOccurrence<omp::clause::Mergeable>(result.mergeable);
}
bool ClauseProcessor::processNowait(mlir::omp::NowaitClauseOps &result) const {
- return markClauseOccurrence<omp::clause::Nowait>(result.nowaitAttr);
+ return markClauseOccurrence<omp::clause::Nowait>(result.nowait);
}
bool ClauseProcessor::processNumTeams(
@@ -385,7 +384,7 @@ bool ClauseProcessor::processNumTeams(
if (auto *clause = findUniqueClause<omp::clause::NumTeams>()) {
// auto lowerBound = std::get<std::optional<ExprTy>>(clause->t);
auto &upperBound = std::get<ExprTy>(clause->t);
- result.numTeamsUpperVar =
+ result.numTeamsUpper =
fir::getBase(converter.genExprValue(upperBound, stmtCtx));
return true;
}
@@ -397,7 +396,7 @@ bool ClauseProcessor::processNumThreads(
mlir::omp::NumThreadsClauseOps &result) const {
if (auto *clause = findUniqueClause<omp::clause::NumThreads>()) {
// OMPIRBuilder expects `NUM_THREADS` clause as a `Value`.
- result.numThreadsVar =
+ result.numThreads =
fir::getBase(converter.genExprValue(clause->v, stmtCtx));
return true;
}
@@ -408,17 +407,17 @@ bool ClauseProcessor::processOrder(mlir::omp::OrderClauseOps &result) const {
using Order = omp::clause::Order;
if (auto *clause = findUniqueClause<Order>()) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
- result.orderAttr = mlir::omp::ClauseOrderKindAttr::get(
+ result.order = mlir::omp::ClauseOrderKindAttr::get(
firOpBuilder.getContext(), mlir::omp::ClauseOrderKind::Concurrent);
const auto &modifier =
std::get<std::optional<Order::OrderModifier>>(clause->t);
if (modifier && *modifier == Order::OrderModifier::Unconstrained) {
- result.orderModAttr = mlir::omp::OrderModifierAttr::get(
+ result.orderMod = mlir::omp::OrderModifierAttr::get(
firOpBuilder.getContext(), mlir::omp::OrderModifier::unconstrained);
} else {
// "If order-modifier is not unconstrained, the behavior is as if the
// reproducible modifier is present."
- result.orderModAttr = mlir::omp::OrderModifierAttr::get(
+ result.orderMod = mlir::omp::OrderModifierAttr::get(
firOpBuilder.getContext(), mlir::omp::OrderModifier::reproducible);
}
return true;
@@ -433,7 +432,7 @@ bool ClauseProcessor::processOrdered(
int64_t orderedClauseValue = 0l;
if (clause->v.has_value())
orderedClauseValue = *evaluate::ToInt64(*clause->v);
- result.orderedAttr = firOpBuilder.getI64IntegerAttr(orderedClauseValue);
+ result.ordered = firOpBuilder.getI64IntegerAttr(orderedClauseValue);
return true;
}
return false;
@@ -443,8 +442,7 @@ bool ClauseProcessor::processPriority(
lower::StatementContext &stmtCtx,
mlir::omp::PriorityClauseOps &result) const {
if (auto *clause = findUniqueClause<omp::clause::Priority>()) {
- result.priorityVar =
- fir::getBase(converter.genExprValue(clause->v, stmtCtx));
+ result.priority = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
return true;
}
return false;
@@ -454,7 +452,7 @@ bool ClauseProcessor::processProcBind(
mlir::omp::ProcBindClauseOps &result) const {
if (auto *clause = findUniqueClause<omp::clause::ProcBind>()) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
- result.procBindKindAttr = genProcBindKindAttr(firOpBuilder, *clause);
+ result.procBindKind = genProcBindKindAttr(firOpBuilder, *clause);
return true;
}
return false;
@@ -465,7 +463,7 @@ bool ClauseProcessor::processSafelen(
if (auto *clause = findUniqueClause<omp::clause::Safelen>()) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
const std::optional<std::int64_t> safelenVal = evaluate::ToInt64(clause->v);
- result.safelenAttr = firOpBuilder.getI64IntegerAttr(*safelenVal);
+ result.safelen = firOpBuilder.getI64IntegerAttr(*safelenVal);
return true;
}
return false;
@@ -498,19 +496,19 @@ bool ClauseProcessor::processSchedule(
break;
}
- result.scheduleValAttr =
+ result.scheduleKind =
mlir::omp::ClauseScheduleKindAttr::get(context, scheduleKind);
- mlir::omp::ScheduleModifier scheduleModifier = getScheduleModifier(*clause);
- if (scheduleModifier != mlir::omp::ScheduleModifier::none)
- result.scheduleModAttr =
- mlir::omp::ScheduleModifierAttr::get(context, scheduleModifier);
+ mlir::omp::ScheduleModifier scheduleMod = getScheduleModifier(*clause);
+ if (scheduleMod != mlir::omp::ScheduleModifier::none)
+ result.scheduleMod =
+ mlir::omp::ScheduleModifierAttr::get(context, scheduleMod);
if (getSimdModifier(*clause) != mlir::omp::ScheduleModifier::none)
- result.scheduleSimdAttr = firOpBuilder.getUnitAttr();
+ result.scheduleSimd = firOpBuilder.getUnitAttr();
if (const auto &chunkExpr = std::get<omp::MaybeExpr>(clause->t))
- result.scheduleChunkVar =
+ result.scheduleChunk =
fir::getBase(converter.genExprValue(*chunkExpr, stmtCtx));
return true;
@@ -523,7 +521,7 @@ bool ClauseProcessor::processSimdlen(
if (auto *clause = findUniqueClause<omp::clause::Simdlen>()) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
const std::optional<std::int64_t> simdlenVal = evaluate::ToInt64(clause->v);
- result.simdlenAttr = firOpBuilder.getI64IntegerAttr(*simdlenVal);
+ result.simdlen = firOpBuilder.getI64IntegerAttr(*simdlenVal);
return true;
}
return false;
@@ -533,7 +531,7 @@ bool ClauseProcessor::processThreadLimit(
lower::StatementContext &stmtCtx,
mlir::omp::ThreadLimitClauseOps &result) const {
if (auto *clause = findUniqueClause<omp::clause::ThreadLimit>()) {
- result.threadLimitVar =
+ result.threadLimit =
fir::getBase(converter.genExprValue(clause->v, stmtCtx));
return true;
}
@@ -541,7 +539,7 @@ bool ClauseProcessor::processThreadLimit(
}
bool ClauseProcessor::processUntied(mlir::omp::UntiedClauseOps &result) const {
- return markClauseOccurrence<omp::clause::Untied>(result.untiedAttr);
+ return markClauseOccurrence<omp::clause::Untied>(result.untied);
}
//===----------------------------------------------------------------------===//
@@ -565,7 +563,7 @@ static void
addAlignedClause(lower::AbstractConverter &converter,
const omp::clause::Aligned &clause,
llvm::SmallVectorImpl<mlir::Value> &alignedVars,
- llvm::SmallVectorImpl<mlir::Attribute> &alignmentAttrs) {
+ llvm::SmallVectorImpl<mlir::Attribute> &alignments) {
using Aligned = omp::clause::Aligned;
lower::StatementContext stmtCtx;
mlir::IntegerAttr alignmentValueAttr;
@@ -594,7 +592,7 @@ addAlignedClause(lower::AbstractConverter &converter,
alignmentValueAttr = builder.getI64IntegerAttr(alignment);
// All the list items in an aligned clause will have the same alignment
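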
for (std::size_t i = 0; i < objects.size(); i++)
- alignmentAttrs.push_back(alignmentValueAttr);
+ alignments.push_back(alignmentValueAttr);
}
}
@@ -603,7 +601,7 @@ bool ClauseProcessor::processAligned(
return findRepeatableClause<omp::clause::Aligned>(
[&](const omp::clause::Aligned &clause, const parser::CharBlock &) {
addAlignedClause(converter, clause, result.alignedVars,
- result.alignmentAttrs);
+ result.alignments);
});
}
@@ -798,7 +796,7 @@ bool ClauseProcessor::processCopyprivate(
result.copyprivateVars.push_back(cpVar);
mlir::func::FuncOp funcOp =
createCopyFunc(currentLocation, converter, cpVar.getType(), attrs);
- result.copyprivateFuncs.push_back(mlir::SymbolRefAttr::get(funcOp));
+ result.copyprivateSyms.push_back(mlir::SymbolRefAttr::get(funcOp));
};
bool hasCopyPrivate = findRepeatableClause<clause::Copyprivate>(
@@ -832,7 +830,7 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const {
mlir::omp::ClauseTaskDependAttr dependTypeOperand =
genDependKindAttr(firOpBuilder, kind);
- result.dependTypeAttrs.append(objects.size(), dependTypeOperand);
+ result.dependKinds.append(objects.size(), dependTypeOperand);
for (const omp::Object &object : objects) {
assert(object.ref() && "Expecting designator");
@@ -1037,10 +1035,9 @@ bool ClauseProcessor::processReduction(
// Copy local lists into the output.
llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
- llvm::copy(reduceVarByRef,
- std::back_inserter(result.reductionVarsByRef));
+ llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref));
llvm::copy(reductionDeclSymbols,
- std::back_inserter(result.reductionDeclSymbols));
+ std::back_inserter(result.reductionSyms));
if (outReductionTypes) {
outReductionTypes->reserve(outReductionTypes->size() +
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 4340c32..2c4b385 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -55,7 +55,7 @@ public:
// 'Unique' clauses: They can appear at most once in the clause list.
bool
processCollapse(mlir::Location currentLocation, lower::pft::Evaluation &eval,
- mlir::omp::CollapseClauseOps &result,
+ mlir::omp::LoopRelatedOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const;
bool processDefault() const;
bool processDevice(lower::StatementContext &stmtCtx,
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index a340b62..78f83ed 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -273,8 +273,9 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
mlir::Value cmpOp;
llvm::SmallVector<mlir::Value> vs;
vs.reserve(loopOp.getIVs().size());
- for (auto [iv, ub, step] : llvm::zip_equal(
- loopOp.getIVs(), loopOp.getUpperBound(), loopOp.getStep())) {
+ for (auto [iv, ub, step] :
+ llvm::zip_equal(loopOp.getIVs(), loopOp.getLoopUpperBounds(),
+ loopOp.getLoopSteps())) {
// v = iv + step
// cmp = step < 0 ? v < ub : v > ub
mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
@@ -593,7 +594,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym,
}();
if (clauseOps) {
- clauseOps->privatizers.push_back(mlir::SymbolRefAttr::get(privatizerOp));
+ clauseOps->privateSyms.push_back(mlir::SymbolRefAttr::get(privatizerOp));
clauseOps->privateVars.push_back(hsb.getAddr());
}
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 1f280a0..2b1839b 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -284,7 +284,7 @@ static void getDeclareTargetInfo(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct,
- mlir::omp::DeclareTargetClauseOps &clauseOps,
+ mlir::omp::DeclareTargetOperands &clauseOps,
llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) {
const auto &spec =
std::get<parser::OmpDeclareTargetSpecifier>(declareTargetConstruct.t);
@@ -322,7 +322,7 @@ static void collectDeferredDeclareTargets(
const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct,
llvm::SmallVectorImpl<lower::OMPDeferredDeclareTargetInfo>
&deferredDeclareTarget) {
- mlir::omp::DeclareTargetClauseOps clauseOps;
+ mlir::omp::DeclareTargetOperands clauseOps;
llvm::SmallVector<DeclareTargetCapturePair> symbolAndClause;
getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct,
clauseOps, symbolAndClause);
@@ -347,7 +347,7 @@ getDeclareTargetFunctionDevice(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct) {
- mlir::omp::DeclareTargetClauseOps clauseOps;
+ mlir::omp::DeclareTargetOperands clauseOps;
llvm::SmallVector<DeclareTargetCapturePair> symbolAndClause;
getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct,
clauseOps, symbolAndClause);
@@ -929,7 +929,7 @@ genBodyOfTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT),
mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType());
- targetOp.getMapOperandsMutable().append(mapOp);
+ targetOp.getMapVarsMutable().append(mapOp);
mlir::Value clonedValArg =
region.addArgument(copyVal.getType(), copyVal.getLoc());
firOpBuilder.setInsertionPointToStart(regionBlock);
@@ -1022,15 +1022,13 @@ static OpTy genWrapperOp(lower::AbstractConverter &converter,
// Code generation functions for clauses
//===----------------------------------------------------------------------===//
-static void genCriticalDeclareClauses(lower::AbstractConverter &converter,
- semantics::SemanticsContext &semaCtx,
- const List<Clause> &clauses,
- mlir::Location loc,
- mlir::omp::CriticalClauseOps &clauseOps,
- llvm::StringRef name) {
+static void genCriticalDeclareClauses(
+ lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
+ const List<Clause> &clauses, mlir::Location loc,
+ mlir::omp::CriticalDeclareOperands &clauseOps, llvm::StringRef name) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processHint(clauseOps);
- clauseOps.criticalNameAttr =
+ clauseOps.symName =
mlir::StringAttr::get(converter.getFirOpBuilder().getContext(), name);
}
@@ -1039,7 +1037,7 @@ static void genDistributeClauses(lower::AbstractConverter &converter,
lower::StatementContext &stmtCtx,
const List<Clause> &clauses,
mlir::Location loc,
- mlir::omp::DistributeClauseOps &clauseOps) {
+ mlir::omp::DistributeOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processDistSchedule(stmtCtx, clauseOps);
@@ -1063,18 +1061,18 @@ static void
genLoopNestClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, const List<Clause> &clauses,
- mlir::Location loc, mlir::omp::LoopNestClauseOps &clauseOps,
+ mlir::Location loc, mlir::omp::LoopNestOperands &clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processCollapse(loc, eval, clauseOps, iv);
- clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr();
+ clauseOps.loopInclusive = converter.getFirOpBuilder().getUnitAttr();
}
static void genMaskedClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::MaskedClauseOps &clauseOps) {
+ mlir::omp::MaskedOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processFilter(stmtCtx, clauseOps);
}
@@ -1083,7 +1081,7 @@ static void
genOrderedRegionClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::OrderedRegionClauseOps &clauseOps) {
+ mlir::omp::OrderedRegionOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processTODO<clause::Simd>(loc, llvm::omp::Directive::OMPD_ordered);
}
@@ -1091,7 +1089,7 @@ genOrderedRegionClauses(lower::AbstractConverter &converter,
static void genParallelClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
- mlir::Location loc, mlir::omp::ParallelClauseOps &clauseOps,
+ mlir::Location loc, mlir::omp::ParallelOperands &clauseOps,
llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
ClauseProcessor cp(converter, semaCtx, clauses);
@@ -1106,7 +1104,7 @@ static void genParallelClauses(
static void genSectionsClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::SectionsClauseOps &clauseOps,
+ mlir::omp::SectionsOperands &clauseOps,
llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
ClauseProcessor cp(converter, semaCtx, clauses);
@@ -1119,7 +1117,7 @@ static void genSectionsClauses(
static void genSimdClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::SimdClauseOps &clauseOps) {
+ mlir::omp::SimdOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAligned(clauseOps);
cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps);
@@ -1136,7 +1134,7 @@ static void genSimdClauses(lower::AbstractConverter &converter,
static void genSingleClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::SingleClauseOps &clauseOps) {
+ mlir::omp::SingleOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processCopyprivate(loc, clauseOps);
@@ -1148,7 +1146,7 @@ static void genTargetClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
mlir::Location loc, bool processHostOnlyClauses,
- mlir::omp::TargetClauseOps &clauseOps,
+ mlir::omp::TargetOperands &clauseOps,
llvm::SmallVectorImpl<const semantics::Symbol *> &mapSyms,
llvm::SmallVectorImpl<mlir::Location> &mapLocs,
llvm::SmallVectorImpl<mlir::Type> &mapTypes,
@@ -1185,7 +1183,7 @@ static void genTargetClauses(
static void genTargetDataClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
- mlir::Location loc, mlir::omp::TargetDataClauseOps &clauseOps,
+ mlir::Location loc, mlir::omp::TargetDataOperands &clauseOps,
llvm::SmallVectorImpl<mlir::Type> &useDeviceTypes,
llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
llvm::SmallVectorImpl<const semantics::Symbol *> &useDeviceSyms) {
@@ -1218,7 +1216,7 @@ static void genTargetEnterExitUpdateDataClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
mlir::Location loc, llvm::omp::Directive directive,
- mlir::omp::TargetEnterExitUpdateDataClauseOps &clauseOps) {
+ mlir::omp::TargetEnterExitUpdateDataOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processDepend(clauseOps);
cp.processDevice(stmtCtx, clauseOps);
@@ -1237,7 +1235,7 @@ static void genTaskClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::TaskClauseOps &clauseOps) {
+ mlir::omp::TaskOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processDefault();
@@ -1256,7 +1254,7 @@ static void genTaskClauses(lower::AbstractConverter &converter,
static void genTaskgroupClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::TaskgroupClauseOps &clauseOps) {
+ mlir::omp::TaskgroupOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processTODO<clause::TaskReduction>(loc,
@@ -1266,7 +1264,7 @@ static void genTaskgroupClauses(lower::AbstractConverter &converter,
static void genTaskwaitClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::TaskwaitClauseOps &clauseOps) {
+ mlir::omp::TaskwaitOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processTODO<clause::Depend, clause::Nowait>(
loc, llvm::omp::Directive::OMPD_taskwait);
@@ -1276,7 +1274,7 @@ static void genTeamsClauses(lower::AbstractConverter &converter,
semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx,
const List<Clause> &clauses, mlir::Location loc,
- mlir::omp::TeamsClauseOps &clauseOps) {
+ mlir::omp::TeamsOperands &clauseOps) {
ClauseProcessor cp(converter, semaCtx, clauses);
cp.processAllocate(clauseOps);
cp.processDefault();
@@ -1291,7 +1289,7 @@ static void genTeamsClauses(lower::AbstractConverter &converter,
static void genWsloopClauses(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::StatementContext &stmtCtx, const List<Clause> &clauses,
- mlir::Location loc, mlir::omp::WsloopClauseOps &clauseOps,
+ mlir::Location loc, mlir::omp::WsloopOperands &clauseOps,
llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
ClauseProcessor cp(converter, semaCtx, clauses);
@@ -1332,7 +1330,7 @@ genCriticalOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mlir::ModuleOp mod = firOpBuilder.getModule();
auto global = mod.lookupSymbol<mlir::omp::CriticalDeclareOp>(nameStr);
if (!global) {
- mlir::omp::CriticalClauseOps clauseOps;
+ mlir::omp::CriticalDeclareOperands clauseOps;
genCriticalDeclareClauses(converter, semaCtx, item->clauses, loc,
clauseOps, nameStr);
@@ -1367,7 +1365,7 @@ genLoopNestOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, mlir::Location loc,
const ConstructQueue &queue, ConstructQueue::iterator item,
- mlir::omp::LoopNestClauseOps &clauseOps,
+ mlir::omp::LoopNestOperands &clauseOps,
llvm::ArrayRef<const semantics::Symbol *> iv,
llvm::ArrayRef<const semantics::Symbol *> wrapperSyms,
llvm::ArrayRef<mlir::BlockArgument> wrapperArgs,
@@ -1395,7 +1393,7 @@ genMaskedOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mlir::Location loc, const ConstructQueue &queue,
ConstructQueue::iterator item) {
lower::StatementContext stmtCtx;
- mlir::omp::MaskedClauseOps clauseOps;
+ mlir::omp::MaskedOperands clauseOps;
genMaskedClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps);
return genOpWithBody<mlir::omp::MaskedOp>(
@@ -1429,7 +1427,7 @@ genOrderedRegionOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, mlir::Location loc,
const ConstructQueue &queue, ConstructQueue::iterator item) {
- mlir::omp::OrderedRegionClauseOps clauseOps;
+ mlir::omp::OrderedRegionOperands clauseOps;
genOrderedRegionClauses(converter, semaCtx, item->clauses, loc, clauseOps);
return genOpWithBody<mlir::omp::OrderedRegionOp>(
@@ -1443,7 +1441,7 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, mlir::Location loc,
const ConstructQueue &queue, ConstructQueue::iterator item,
- mlir::omp::ParallelClauseOps &clauseOps,
+ mlir::omp::ParallelOperands &clauseOps,
llvm::ArrayRef<const semantics::Symbol *> reductionSyms,
llvm::ArrayRef<mlir::Type> reductionTypes) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -1534,7 +1532,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
const parser::OmpSectionBlocks &sectionBlocks) {
llvm::SmallVector<mlir::Type> reductionTypes;
llvm::SmallVector<const semantics::Symbol *> reductionSyms;
- mlir::omp::SectionsClauseOps clauseOps;
+ mlir::omp::SectionsOperands clauseOps;
genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps,
reductionTypes, reductionSyms);
@@ -1635,7 +1633,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
// Emit implicit barrier to synchronize threads and avoid data
// races on post-update of lastprivate variables when `nowait`
// clause is present.
- if (clauseOps.nowaitAttr && !lastprivates.empty())
+ if (clauseOps.nowait && !lastprivates.empty())
builder.create<mlir::omp::BarrierOp>(loc);
symTable.popScope();
@@ -1647,7 +1645,7 @@ genSingleOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
mlir::Location loc, const ConstructQueue &queue,
ConstructQueue::iterator item) {
- mlir::omp::SingleClauseOps clauseOps;
+ mlir::omp::SingleOperands clauseOps;
genSingleClauses(converter, semaCtx, item->clauses, loc, clauseOps);
return genOpWithBody<mlir::omp::SingleOp>(
@@ -1669,7 +1667,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
!llvm::cast<mlir::omp::OffloadModuleInterface>(*converter.getModuleOp())
.getIsTargetDevice();
- mlir::omp::TargetClauseOps clauseOps;
+ mlir::omp::TargetOperands clauseOps;
llvm::SmallVector<const semantics::Symbol *> mapSyms, devicePtrSyms,
deviceAddrSyms;
llvm::SmallVector<mlir::Location> mapLocs, devicePtrLocs, deviceAddrLocs;
@@ -1797,7 +1795,7 @@ genTargetDataOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
lower::pft::Evaluation &eval, mlir::Location loc,
const ConstructQueue &queue, ConstructQueue::iterator item) {
lower::StatementContext stmtCtx;
- mlir::omp::TargetDataClauseOps clauseOps;
+ mlir::omp::TargetDataOperands clauseOps;
llvm::SmallVector<mlir::Type> useDeviceTypes;
llvm::SmallVector<mlir::Location> useDeviceLocs;
llvm::SmallVector<const semantics::Symbol *> useDeviceSyms;
@@ -1835,7 +1833,7 @@ static OpTy genTargetEnterExitUpdateDataOp(lower::AbstractConverter &converter,
llvm_unreachable("Unexpected TARGET DATA construct");
}
- mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps;
+ mlir::omp::TargetEnterExitUpdateDataOperands clauseOps;
genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx,
item->clauses, loc, directive, clauseOps);
@@ -1848,7 +1846,7 @@ genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mlir::Location loc, const ConstructQueue &queue,
ConstructQueue::iterator item) {
lower::StatementContext stmtCtx;
- mlir::omp::TaskClauseOps clauseOps;
+ mlir::omp::TaskOperands clauseOps;
genTaskClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps);
return genOpWithBody<mlir::omp::TaskOp>(
@@ -1863,7 +1861,7 @@ genTaskgroupOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, mlir::Location loc,
const ConstructQueue &queue, ConstructQueue::iterator item) {
- mlir::omp::TaskgroupClauseOps clauseOps;
+ mlir::omp::TaskgroupOperands clauseOps;
genTaskgroupClauses(converter, semaCtx, item->clauses, loc, clauseOps);
return genOpWithBody<mlir::omp::TaskgroupOp>(
@@ -1878,7 +1876,7 @@ genTaskwaitOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, mlir::Location loc,
const ConstructQueue &queue, ConstructQueue::iterator item) {
- mlir::omp::TaskwaitClauseOps clauseOps;
+ mlir::omp::TaskwaitOperands clauseOps;
genTaskwaitClauses(converter, semaCtx, item->clauses, loc, clauseOps);
return converter.getFirOpBuilder().create<mlir::omp::TaskwaitOp>(loc,
clauseOps);
@@ -1898,7 +1896,7 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mlir::Location loc, const ConstructQueue &queue,
ConstructQueue::iterator item) {
lower::StatementContext stmtCtx;
- mlir::omp::TeamsClauseOps clauseOps;
+ mlir::omp::TeamsOperands clauseOps;
genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps);
return genOpWithBody<mlir::omp::TeamsOp>(
@@ -1920,11 +1918,11 @@ static void genStandaloneDistribute(
ConstructQueue::iterator item, DataSharingProcessor &dsp) {
lower::StatementContext stmtCtx;
- mlir::omp::DistributeClauseOps distributeClauseOps;
+ mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);
- mlir::omp::LoopNestClauseOps loopNestClauseOps;
+ mlir::omp::LoopNestOperands loopNestClauseOps;
llvm::SmallVector<const semantics::Symbol *> iv;
genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
loopNestClauseOps, iv);
@@ -1948,13 +1946,13 @@ static void genStandaloneDo(lower::AbstractConverter &converter,
DataSharingProcessor &dsp) {
lower::StatementContext stmtCtx;
- mlir::omp::WsloopClauseOps wsloopClauseOps;
+ mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> reductionSyms;
llvm::SmallVector<mlir::Type> reductionTypes;
genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
wsloopClauseOps, reductionTypes, reductionSyms);
- mlir::omp::LoopNestClauseOps loopNestClauseOps;
+ mlir::omp::LoopNestOperands loopNestClauseOps;
llvm::SmallVector<const semantics::Symbol *> iv;
genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
loopNestClauseOps, iv);
@@ -1978,7 +1976,7 @@ static void genStandaloneParallel(lower::AbstractConverter &converter,
ConstructQueue::iterator item) {
lower::StatementContext stmtCtx;
- mlir::omp::ParallelClauseOps clauseOps;
+ mlir::omp::ParallelOperands clauseOps;
llvm::SmallVector<const semantics::Symbol *> reductionSyms;
llvm::SmallVector<mlir::Type> reductionTypes;
genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps,
@@ -1995,10 +1993,10 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
const ConstructQueue &queue,
ConstructQueue::iterator item,
DataSharingProcessor &dsp) {
- mlir::omp::SimdClauseOps simdClauseOps;
+ mlir::omp::SimdOperands simdClauseOps;
genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps);
- mlir::omp::LoopNestClauseOps loopNestClauseOps;
+ mlir::omp::LoopNestOperands loopNestClauseOps;
llvm::SmallVector<const semantics::Symbol *> iv;
genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
loopNestClauseOps, iv);
@@ -2049,14 +2047,14 @@ static void genCompositeDistributeSimd(
lower::StatementContext stmtCtx;
// Clause processing.
- mlir::omp::DistributeClauseOps distributeClauseOps;
+ mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);
- mlir::omp::SimdClauseOps simdClauseOps;
+ mlir::omp::SimdOperands simdClauseOps;
genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps);
- mlir::omp::LoopNestClauseOps loopNestClauseOps;
+ mlir::omp::LoopNestOperands loopNestClauseOps;
llvm::SmallVector<const semantics::Symbol *> iv;
genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
loopNestClauseOps, iv);
@@ -2095,16 +2093,16 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
lower::StatementContext stmtCtx;
// Clause processing.
- mlir::omp::WsloopClauseOps wsloopClauseOps;
+ mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
llvm::SmallVector<mlir::Type> wsloopReductionTypes;
genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
wsloopClauseOps, wsloopReductionTypes, wsloopReductionSyms);
- mlir::omp::SimdClauseOps simdClauseOps;
+ mlir::omp::SimdOperands simdClauseOps;
genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps);
- mlir::omp::LoopNestClauseOps loopNestClauseOps;
+ mlir::omp::LoopNestOperands loopNestClauseOps;
llvm::SmallVector<const semantics::Symbol *> iv;
genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
loopNestClauseOps, iv);
@@ -2315,7 +2313,7 @@ static void
genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
const parser::OpenMPDeclareTargetConstruct &declareTargetConstruct) {
- mlir::omp::DeclareTargetClauseOps clauseOps;
+ mlir::omp::DeclareTargetOperands clauseOps;
llvm::SmallVector<DeclareTargetCapturePair> symbolAndClause;
mlir::ModuleOp mod = converter.getFirOpBuilder().getModule();
getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct,
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index fbe79d0..d54715d 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -248,6 +248,11 @@ mlir::Value fir::FirOpBuilder::allocateLocal(
/// Get the block for adding Allocas.
mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
+ if (auto accComputeRegionIface =
+ getRegion().getParentOfType<mlir::acc::ComputeRegionOpInterface>()) {
+ return accComputeRegionIface.getAllocaBlock();
+ }
+
if (auto ompOutlineableIface =
getRegion()
.getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 0e5e30a..2243901 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -98,9 +98,6 @@ static bool isStaticallyPresent(const fir::ExtendedValue &exv) {
/// IEEE module procedure names not yet implemented for genModuleProcTODO.
static constexpr char ieee_int[] = "ieee_int";
static constexpr char ieee_get_underflow_mode[] = "ieee_get_underflow_mode";
-static constexpr char ieee_next_after[] = "ieee_next_after";
-static constexpr char ieee_next_down[] = "ieee_next_down";
-static constexpr char ieee_next_up[] = "ieee_next_up";
static constexpr char ieee_real[] = "ieee_real";
static constexpr char ieee_rem[] = "ieee_rem";
static constexpr char ieee_rint[] = "ieee_rint";
@@ -355,9 +352,9 @@ static constexpr IntrinsicHandler handlers[]{
&I::genIeeeMaxMin</*isMax=*/false, /*isNum=*/true, /*isMag=*/false>},
{"ieee_min_num_mag",
&I::genIeeeMaxMin</*isMax=*/false, /*isNum=*/true, /*isMag=*/true>},
- {"ieee_next_after", &I::genModuleProcTODO<ieee_next_after>},
- {"ieee_next_down", &I::genModuleProcTODO<ieee_next_down>},
- {"ieee_next_up", &I::genModuleProcTODO<ieee_next_up>},
+ {"ieee_next_after", &I::genNearest<I::NearestProc::NextAfter>},
+ {"ieee_next_down", &I::genNearest<I::NearestProc::NextDown>},
+ {"ieee_next_up", &I::genNearest<I::NearestProc::NextUp>},
{"ieee_quiet_eq", &I::genIeeeQuietCompare<mlir::arith::CmpFPredicate::OEQ>},
{"ieee_quiet_ge", &I::genIeeeQuietCompare<mlir::arith::CmpFPredicate::OGE>},
{"ieee_quiet_gt", &I::genIeeeQuietCompare<mlir::arith::CmpFPredicate::OGT>},
@@ -497,7 +494,7 @@ static constexpr IntrinsicHandler handlers[]{
{"len", asValue},
{"to", asAddr},
{"topos", asValue}}}},
- {"nearest", &I::genNearest},
+ {"nearest", &I::genNearest<I::NearestProc::Nearest>},
{"nint", &I::genNint},
{"norm2",
&I::genNorm2,
@@ -3972,11 +3969,14 @@ IntrinsicLibrary::genIchar(mlir::Type resultType,
// 8 Positive normal
// 9 Positive infinity
static constexpr int finiteTest = 0b0111111000;
+static constexpr int infiniteTest = 0b1000000100;
static constexpr int nanTest = 0b0000000011;
static constexpr int negativeTest = 0b0000111100;
static constexpr int normalTest = 0b0101101000;
static constexpr int positiveTest = 0b1111000000;
static constexpr int snanTest = 0b0000000001;
+static constexpr int subnormalTest = 0b0010010000;
+static constexpr int zeroTest = 0b0001100000;
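Note: the new masks follow the same llvm.is.fpclass bit layout as the existing ones, where (per the comment above) bit 9 is positive infinity; bit 2 is assumed to be negative infinity, bits 4 and 7 the subnormals, and bits 5 and 6 the zeros. Under that layout, the constants select exactly the paired classes (illustrative checks, not part of the patch):

static_assert(infiniteTest  == ((1 << 2) | (1 << 9)), "negative and positive infinity");
static_assert(subnormalTest == ((1 << 4) | (1 << 7)), "negative and positive subnormal");
static_assert(zeroTest      == ((1 << 5) | (1 << 6)), "negative and positive zero");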
mlir::Value IntrinsicLibrary::genIsFPClass(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args,
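
The *Test constants here are ten-bit class masks: each bit selects one of the floating-point categories numbered 0 through 9 in the comment above (bit 0 = signaling NaN, bit 9 = positive infinity), the same layout as the test-mask operand of LLVM's llvm.is.fpclass intrinsic. A minimal standalone C++ sketch, not taken from the patch and using illustrative enumerator names, showing how the newly added masks compose:

// Rebuilds a few of the class masks above from individual category bits.
#include <cstdio>

enum FPClassBit : int {
  kSNaN = 1 << 0,         // 0 Signaling NaN
  kQNaN = 1 << 1,         // 1 Quiet NaN
  kNegInf = 1 << 2,       // 2 Negative infinity
  kNegNormal = 1 << 3,    // 3 Negative normal
  kNegSubnormal = 1 << 4, // 4 Negative subnormal
  kNegZero = 1 << 5,      // 5 Negative zero
  kPosZero = 1 << 6,      // 6 Positive zero
  kPosSubnormal = 1 << 7, // 7 Positive subnormal
  kPosNormal = 1 << 8,    // 8 Positive normal
  kPosInf = 1 << 9,       // 9 Positive infinity
};

constexpr int nanTest = kSNaN | kQNaN;                       // 0b0000000011
constexpr int infiniteTest = kNegInf | kPosInf;              // 0b1000000100
constexpr int zeroTest = kNegZero | kPosZero;                // 0b0001100000
constexpr int subnormalTest = kNegSubnormal | kPosSubnormal; // 0b0010010000
static_assert(nanTest == 0b0000000011 && infiniteTest == 0b1000000100 &&
              zeroTest == 0b0001100000 && subnormalTest == 0b0010010000);

int main() {
  std::printf("infiniteTest = %#x\n", static_cast<unsigned>(infiniteTest));
}
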
@@ -3988,8 +3988,15 @@ mlir::Value IntrinsicLibrary::genIsFPClass(mlir::Type resultType,
return builder.createConvert(loc, resultType, isfpclass);
}
-/// Generate code to raise \p except if \p cond is absent, or present and true.
-void IntrinsicLibrary::genRaiseExcept(int except, mlir::Value cond) {
+// Generate a quiet NaN of a given floating point type.
+mlir::Value IntrinsicLibrary::genQNan(mlir::Type resultType) {
+ return genIeeeValue(resultType, builder.createIntegerConstant(
+ loc, builder.getIntegerType(8),
+ _FORTRAN_RUNTIME_IEEE_QUIET_NAN));
+}
+
+// Generate code to raise \p excepts if \p cond is absent, or present and true.
+void IntrinsicLibrary::genRaiseExcept(int excepts, mlir::Value cond) {
fir::IfOp ifOp;
if (cond) {
ifOp = builder.create<fir::IfOp>(loc, cond, /*withElseRegion=*/false);
@@ -3998,8 +4005,8 @@ void IntrinsicLibrary::genRaiseExcept(int except, mlir::Value cond) {
mlir::Type i32Ty = builder.getIntegerType(32);
genRuntimeCall(
"feraiseexcept", i32Ty,
- fir::runtime::genMapException(
- builder, loc, builder.createIntegerConstant(loc, i32Ty, except)));
+ fir::runtime::genMapExcept(
+ builder, loc, builder.createIntegerConstant(loc, i32Ty, excepts)));
if (cond)
builder.setInsertionPointAfter(ifOp);
}
@@ -4363,14 +4370,14 @@ void IntrinsicLibrary::genIeeeGetFlag(llvm::ArrayRef<fir::ExtendedValue> args) {
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
auto [fieldRef, ignore] = getFieldRef(builder, loc, flag);
mlir::Value field = builder.create<fir::LoadOp>(loc, fieldRef);
- mlir::Value exceptSet = IntrinsicLibrary::genRuntimeCall(
+ mlir::Value excepts = IntrinsicLibrary::genRuntimeCall(
"fetestexcept", i32Ty,
- fir::runtime::genMapException(
+ fir::runtime::genMapExcept(
builder, loc, builder.create<fir::ConvertOp>(loc, i32Ty, field)));
mlir::Value logicalResult = builder.create<fir::ConvertOp>(
loc, resultTy,
builder.create<mlir::arith::CmpIOp>(loc, mlir::arith::CmpIPredicate::ne,
- exceptSet, zero));
+ excepts, zero));
builder.create<fir::StoreOp>(loc, logicalResult, flagValue);
}
@@ -4391,7 +4398,7 @@ void IntrinsicLibrary::genIeeeGetHaltingMode(
IntrinsicLibrary::genRuntimeCall("fegetexcept", i32Ty, {});
mlir::Value intResult = builder.create<mlir::arith::AndIOp>(
loc, haltSet,
- fir::runtime::genMapException(
+ fir::runtime::genMapExcept(
builder, loc, builder.create<fir::ConvertOp>(loc, i32Ty, field)));
mlir::Value logicalResult = builder.create<fir::ConvertOp>(
loc, resultTy,
@@ -4657,7 +4664,6 @@ mlir::Value IntrinsicLibrary::genIeeeMaxMin(mlir::Type resultType,
y1 = y;
}
mlir::Type i1Ty = builder.getI1Type();
- mlir::Type i8Ty = builder.getIntegerType(8);
mlir::arith::CmpFPredicate pred;
mlir::Value cmp, result, resultIsX, resultIsY;
@@ -4698,12 +4704,10 @@ mlir::Value IntrinsicLibrary::genIeeeMaxMin(mlir::Type resultType,
} else {
resultIsX = resultIsY = builder.createBool(loc, false);
}
- mlir::Value qNaN =
- genIeeeValue(resultType, builder.createIntegerConstant(
- loc, i8Ty, _FORTRAN_RUNTIME_IEEE_QUIET_NAN));
result = builder.create<mlir::arith::SelectOp>(
loc, resultIsX, x,
- builder.create<mlir::arith::SelectOp>(loc, resultIsY, y, qNaN));
+ builder.create<mlir::arith::SelectOp>(loc, resultIsY, y,
+ genQNan(resultType)));
mlir::Value hasSNaNOp = builder.create<mlir::arith::OrIOp>(
loc, genIsFPClass(builder.getI1Type(), args[0], snanTest),
genIsFPClass(builder.getI1Type(), args[1], snanTest));
@@ -4747,7 +4751,7 @@ void IntrinsicLibrary::genIeeeSetFlagOrHaltingMode(
mlir::Type i32Ty = builder.getIntegerType(32);
auto [fieldRef, ignore] = getFieldRef(builder, loc, getBase(args[0]));
mlir::Value field = builder.create<fir::LoadOp>(loc, fieldRef);
- mlir::Value except = fir::runtime::genMapException(
+ mlir::Value except = fir::runtime::genMapExcept(
builder, loc, builder.create<fir::ConvertOp>(loc, i32Ty, field));
auto ifOp = builder.create<fir::IfOp>(
loc, builder.create<fir::ConvertOp>(loc, i1Ty, getBase(args[1])),
@@ -5610,16 +5614,186 @@ void IntrinsicLibrary::genMvbits(llvm::ArrayRef<fir::ExtendedValue> args) {
builder.create<fir::StoreOp>(loc, res, toAddr);
}
-// NEAREST
+// NEAREST, IEEE_NEXT_AFTER, IEEE_NEXT_DOWN, IEEE_NEXT_UP
+template <I::NearestProc proc>
mlir::Value IntrinsicLibrary::genNearest(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
- assert(args.size() == 2);
+ // NEAREST
+ // Return the number adjacent to arg X in the direction of the infinity
+ // with the sign of arg S. Terminate with an error if arg S is zero.
+ // Generate exceptions as for IEEE_NEXT_AFTER.
+ // IEEE_NEXT_AFTER
+ // Return isNan(Y) ? NaN : X==Y ? X : num adjacent to X in the dir of Y.
+ // Signal IEEE_OVERFLOW, IEEE_INEXACT for finite X and infinite result.
+ // Signal IEEE_UNDERFLOW, IEEE_INEXACT for subnormal result.
+ // IEEE_NEXT_DOWN
+ // Return the number adjacent to X and less than X.
+ // Signal IEEE_INVALID when X is a signaling NaN.
+ // IEEE_NEXT_UP
+ // Return the number adjacent to X and greater than X.
+ // Signal IEEE_INVALID when X is a signaling NaN.
+ //
+ // valueUp -- true if a finite result must be larger than X.
+ // magnitudeUp -- true if a finite abs(result) must be larger than abs(X).
+ //
+ // if (isNextAfter && isNan(Y)) X = NaN // result = NaN
+ // if (isNan(X) || (isNextAfter && X == Y) || (isInfinite(X) && magnitudeUp))
+ // result = X
+ // else if (isZero(X))
+ // result = valueUp ? minPositiveSubnormal : minNegativeSubnormal
+ // else
+  //     result = magnitudeUp ? (X + minPositiveSubnormal) : (X - minPositiveSubnormal)
- mlir::Value realX = fir::getBase(args[0]);
- mlir::Value realS = fir::getBase(args[1]);
+ assert(args.size() == 1 || args.size() == 2);
+ mlir::Value x = args[0];
+ mlir::FloatType xType = mlir::dyn_cast<mlir::FloatType>(x.getType());
+ const unsigned xBitWidth = xType.getWidth();
+ mlir::Type i1Ty = builder.getI1Type();
+ if constexpr (proc == NearestProc::NextAfter)
+ // If isNan(Y), set X to a qNaN that will propagate to the resultIsX result.
+ x = builder.create<mlir::arith::SelectOp>(
+ loc, genIsFPClass(i1Ty, args[1], nanTest), genQNan(xType), x);
+ mlir::Value resultIsX = genIsFPClass(i1Ty, x, nanTest);
+ mlir::Type intType = builder.getIntegerType(xBitWidth);
+ mlir::Value one = builder.createIntegerConstant(loc, intType, 1);
- return builder.createConvert(
- loc, resultType, fir::runtime::genNearest(builder, loc, realX, realS));
+ // Set valueUp to true if a finite result must be larger than arg X.
+ mlir::Value valueUp;
+ if constexpr (proc == NearestProc::Nearest) {
+ // Arg S must not be zero.
+ fir::IfOp ifOp =
+ builder.create<fir::IfOp>(loc, genIsFPClass(i1Ty, args[1], zeroTest),
+ /*withElseRegion=*/false);
+ builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+ fir::runtime::genReportFatalUserError(
+ builder, loc, "intrinsic nearest S argument is zero");
+ builder.setInsertionPointAfter(ifOp);
+ mlir::Value sSign = IntrinsicLibrary::genIeeeSignbit(intType, {args[1]});
+ valueUp = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::ne, sSign, one);
+ } else if constexpr (proc == NearestProc::NextAfter) {
+ // Convert X and Y to a common type to allow comparison. Direct conversions
+ // between kinds 2, 3, 10, and 16 are not all supported. These conversions
+ // are implemented by converting kind=2,3 values to kind=4, possibly
+    // followed by a conversion of that value to a larger type.
+ mlir::Value x1 = x;
+ mlir::Value y = args[1];
+ mlir::FloatType yType = mlir::dyn_cast<mlir::FloatType>(args[1].getType());
+ const unsigned yBitWidth = yType.getWidth();
+ if (xType != yType) {
+ mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext());
+ if (xBitWidth < 32)
+ x1 = builder.createConvert(loc, f32Ty, x1);
+ if (yBitWidth > 32 && yBitWidth > xBitWidth)
+ x1 = builder.createConvert(loc, yType, x1);
+ if (yBitWidth < 32)
+ y = builder.createConvert(loc, f32Ty, y);
+ if (xBitWidth > 32 && xBitWidth > yBitWidth)
+ y = builder.createConvert(loc, xType, y);
+ }
+ resultIsX = builder.create<mlir::arith::OrIOp>(
+ loc, resultIsX,
+ builder.create<mlir::arith::CmpFOp>(
+ loc, mlir::arith::CmpFPredicate::OEQ, x1, y));
+ valueUp = builder.create<mlir::arith::CmpFOp>(
+ loc, mlir::arith::CmpFPredicate::OLT, x1, y);
+ } else if constexpr (proc == NearestProc::NextDown) {
+ valueUp = builder.createBool(loc, false);
+ } else if constexpr (proc == NearestProc::NextUp) {
+ valueUp = builder.createBool(loc, true);
+ }
+ mlir::Value magnitudeUp = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::ne, valueUp,
+ IntrinsicLibrary::genIeeeSignbit(i1Ty, {args[0]}));
+ resultIsX = builder.create<mlir::arith::OrIOp>(
+ loc, resultIsX,
+ builder.create<mlir::arith::AndIOp>(
+ loc, genIsFPClass(i1Ty, x, infiniteTest), magnitudeUp));
+
+ // Result is X. (For ieee_next_after with isNan(Y), X has been set to a NaN.)
+ fir::IfOp outerIfOp = builder.create<fir::IfOp>(loc, resultType, resultIsX,
+ /*withElseRegion=*/true);
+ builder.setInsertionPointToStart(&outerIfOp.getThenRegion().front());
+ if constexpr (proc == NearestProc::NextDown || proc == NearestProc::NextUp)
+ genRaiseExcept(_FORTRAN_RUNTIME_IEEE_INVALID,
+ genIsFPClass(i1Ty, x, snanTest));
+ builder.create<fir::ResultOp>(loc, x);
+
+ // Result is minPositiveSubnormal or minNegativeSubnormal. (X is zero.)
+ builder.setInsertionPointToStart(&outerIfOp.getElseRegion().front());
+ mlir::Value resultIsMinSubnormal = builder.create<mlir::arith::CmpFOp>(
+ loc, mlir::arith::CmpFPredicate::OEQ, x,
+ builder.createRealZeroConstant(loc, xType));
+ fir::IfOp innerIfOp =
+ builder.create<fir::IfOp>(loc, resultType, resultIsMinSubnormal,
+ /*withElseRegion=*/true);
+ builder.setInsertionPointToStart(&innerIfOp.getThenRegion().front());
+ mlir::Value minPositiveSubnormal =
+ builder.create<mlir::arith::BitcastOp>(loc, resultType, one);
+ mlir::Value minNegativeSubnormal = builder.create<mlir::arith::BitcastOp>(
+ loc, resultType,
+ builder.create<mlir::arith::ConstantOp>(
+ loc, intType,
+ builder.getIntegerAttr(
+ intType, llvm::APInt::getBitsSetWithWrap(
+ xBitWidth, /*lo=*/xBitWidth - 1, /*hi=*/1))));
+ mlir::Value result = builder.create<mlir::arith::SelectOp>(
+ loc, valueUp, minPositiveSubnormal, minNegativeSubnormal);
+ if constexpr (proc == NearestProc::Nearest || proc == NearestProc::NextAfter)
+ genRaiseExcept(_FORTRAN_RUNTIME_IEEE_UNDERFLOW |
+ _FORTRAN_RUNTIME_IEEE_INEXACT);
+ builder.create<fir::ResultOp>(loc, result);
+
+ // Result is (X + minPositiveSubnormal) or (X - minPositiveSubnormal).
+ builder.setInsertionPointToStart(&innerIfOp.getElseRegion().front());
+ if (xBitWidth == 80) {
+ // Kind 10. Call std::nextafter, which generates exceptions as required
+ // for ieee_next_after and nearest. Override this exception processing
+ // for ieee_next_down and ieee_next_up.
+ constexpr bool overrideExceptionGeneration =
+ proc == NearestProc::NextDown || proc == NearestProc::NextUp;
+ [[maybe_unused]] mlir::Type i32Ty;
+ [[maybe_unused]] mlir::Value allExcepts, excepts, mask;
+ if constexpr (overrideExceptionGeneration) {
+ i32Ty = builder.getIntegerType(32);
+ allExcepts = fir::runtime::genMapExcept(
+ builder, loc,
+ builder.createIntegerConstant(loc, i32Ty, _FORTRAN_RUNTIME_IEEE_ALL));
+ excepts = genRuntimeCall("fetestexcept", i32Ty, allExcepts);
+ mask = genRuntimeCall("fedisableexcept", i32Ty, allExcepts);
+ }
+ result = fir::runtime::genNearest(builder, loc, x, valueUp);
+ if constexpr (overrideExceptionGeneration) {
+ genRuntimeCall("feclearexcept", i32Ty, allExcepts);
+ genRuntimeCall("feraiseexcept", i32Ty, excepts);
+ genRuntimeCall("feenableexcept", i32Ty, mask);
+ }
+ builder.create<fir::ResultOp>(loc, result);
+ } else {
+ // Kind 2, 3, 4, 8, 16. Increment or decrement X cast to integer.
+ mlir::Value intX = builder.create<mlir::arith::BitcastOp>(loc, intType, x);
+ result = builder.create<mlir::arith::BitcastOp>(
+ loc, resultType,
+ builder.create<mlir::arith::SelectOp>(
+ loc, magnitudeUp,
+ builder.create<mlir::arith::AddIOp>(loc, intX, one),
+ builder.create<mlir::arith::SubIOp>(loc, intX, one)));
+ if constexpr (proc == NearestProc::Nearest ||
+ proc == NearestProc::NextAfter) {
+ genRaiseExcept(_FORTRAN_RUNTIME_IEEE_OVERFLOW |
+ _FORTRAN_RUNTIME_IEEE_INEXACT,
+ genIsFPClass(i1Ty, result, infiniteTest));
+ genRaiseExcept(_FORTRAN_RUNTIME_IEEE_UNDERFLOW |
+ _FORTRAN_RUNTIME_IEEE_INEXACT,
+ genIsFPClass(i1Ty, result, subnormalTest));
+ }
+ builder.create<fir::ResultOp>(loc, result);
+ }
+
+ builder.setInsertionPointAfter(innerIfOp);
+ builder.create<fir::ResultOp>(loc, innerIfOp.getResult(0));
+ builder.setInsertionPointAfter(outerIfOp);
+ return outerIfOp.getResult(0);
}
// NINT
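
For kinds 2, 3, 4, 8, and 16, the lowering above obtains the adjacent value by bitcasting X to a same-width integer and adding or subtracting 1, which steps through IEEE-754 encodings in magnitude order; NaN, infinity, zero, and the resulting overflow/underflow signaling are handled by the surrounding branches. A standalone C++20 sketch of that single step for 32-bit float, not taken from the patch (the helper name is illustrative):

#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

// Return the float adjacent to x: away from zero when magnitudeUp is true,
// toward zero otherwise. Assumes x is finite and nonzero; the lowering above
// handles NaN, infinity, and zero in separate branches.
static float adjacentFloat(float x, bool magnitudeUp) {
  std::uint32_t bits = std::bit_cast<std::uint32_t>(x);
  bits = magnitudeUp ? bits + 1 : bits - 1;
  return std::bit_cast<float>(bits);
}

int main() {
  assert(adjacentFloat(1.0f, /*magnitudeUp=*/true) == std::nextafter(1.0f, 2.0f));
  assert(adjacentFloat(-1.0f, /*magnitudeUp=*/true) == std::nextafter(-1.0f, -2.0f));
  assert(adjacentFloat(1.0f, /*magnitudeUp=*/false) == std::nextafter(1.0f, 0.0f));
}

Because incrementing the bit pattern of a negative value also grows its magnitude, the select in the lowering keys on magnitudeUp rather than valueUp.
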
diff --git a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
index 294ccba..8775b50 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
@@ -13,10 +13,10 @@
using namespace Fortran::runtime;
-mlir::Value fir::runtime::genMapException(fir::FirOpBuilder &builder,
- mlir::Location loc,
- mlir::Value except) {
+mlir::Value fir::runtime::genMapExcept(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value excepts) {
mlir::func::FuncOp func{
fir::runtime::getRuntimeFunc<mkRTKey(MapException)>(loc, builder)};
- return builder.create<fir::CallOp>(loc, func, except).getResult(0);
+ return builder.create<fir::CallOp>(loc, func, excepts).getResult(0);
}
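
genMapExcept (formerly genMapException) still calls the runtime's MapException entry point; as its use with fetestexcept and feraiseexcept above suggests, it translates a set of _FORTRAN_RUNTIME_IEEE_* flags into the host fenv.h encoding, and the plural name reflects that the argument is a flag set rather than a single exception. The kind-10 path of genNearest applies this to save the currently raised flags, run std::nextafter, discard whatever it raised, and re-raise the saved flags. A rough host-side analogue, not taken from the patch, using only portable <cfenv> calls (the actual lowering also toggles trapping with the glibc-style fedisableexcept/feenableexcept, omitted here):

#include <cfenv>
#include <cmath>
#include <cstdio>

// Compute the next value up without letting nextafter's IEEE flags leak
// into the program's floating-point state.
// (Strict FP models may additionally require FENV_ACCESS to be enabled.)
static long double nextUpQuiet(long double x) {
  int pending = std::fetestexcept(FE_ALL_EXCEPT);    // save flags already raised
  long double result = std::nextafter(x, HUGE_VALL); // may raise inexact/overflow
  std::feclearexcept(FE_ALL_EXCEPT);                 // drop what nextafter raised
  std::feraiseexcept(pending);                       // restore the saved flags
  return result;
}

int main() {
  std::printf("%.21Lg\n", nextUpQuiet(1.0L));
  std::printf("inexact pending? %d\n", std::fetestexcept(FE_INEXACT) != 0);
}
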
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
index 1d13248..d982884 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -406,10 +406,10 @@ mlir::Value fir::runtime::genModulo(fir::FirOpBuilder &builder,
return builder.create<fir::CallOp>(loc, func, args).getResult(0);
}
-/// Generate call to Nearest intrinsic runtime routine.
+/// Generate call to Nearest intrinsic or a "Next" intrinsic module procedure.
mlir::Value fir::runtime::genNearest(fir::FirOpBuilder &builder,
mlir::Location loc, mlir::Value x,
- mlir::Value s) {
+ mlir::Value valueUp) {
mlir::func::FuncOp func;
mlir::Type fltTy = x.getType();
@@ -425,19 +425,7 @@ mlir::Value fir::runtime::genNearest(fir::FirOpBuilder &builder,
fir::intrinsicTypeTODO(builder, fltTy, loc, "NEAREST");
auto funcTy = func.getFunctionType();
-
- mlir::Type sTy = s.getType();
- mlir::Value zero = builder.createRealZeroConstant(loc, sTy);
- auto cmp = builder.create<mlir::arith::CmpFOp>(
- loc, mlir::arith::CmpFPredicate::OGT, s, zero);
-
- mlir::Type boolTy = mlir::IntegerType::get(builder.getContext(), 1);
- mlir::Value False = builder.createIntegerConstant(loc, boolTy, 0);
- mlir::Value True = builder.createIntegerConstant(loc, boolTy, 1);
-
- mlir::Value positive =
- builder.create<mlir::arith::SelectOp>(loc, cmp, True, False);
- auto args = fir::runtime::createArguments(builder, loc, funcTy, x, positive);
+ auto args = fir::runtime::createArguments(builder, loc, funcTy, x, valueUp);
return builder.create<fir::CallOp>(loc, func, args).getResult(0);
}
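
With this change the runtime wrapper no longer inspects S itself: its second argument is a precomputed direction flag, and the NEAREST-specific handling (reject S == 0, take the direction from the sign of S) now lives in IntrinsicLibrary::genNearest above. A standalone sketch of that caller-side computation, not taken from the patch (names are illustrative):

#include <cmath>
#include <cstdio>
#include <cstdlib>

// Derive the direction flag that would be passed on to the runtime call.
static bool directionUp(double s) {
  if (s == 0.0) { // NEAREST: S must not be zero
    std::fprintf(stderr, "intrinsic nearest S argument is zero\n");
    std::exit(EXIT_FAILURE);
  }
  return !std::signbit(s); // true: step toward +infinity
}

int main() {
  std::printf("S=+2.5 -> up=%d\n", directionUp(2.5));
  std::printf("S=-0.5 -> up=%d\n", directionUp(-0.5));
}
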
diff --git a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
index 35203fe..ddaa3c5 100644
--- a/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/Transforms/OMPMapInfoFinalization.cpp
@@ -109,10 +109,10 @@ class OMPMapInfoFinalizationPass
if (auto mapClauseOwner =
llvm::dyn_cast<mlir::omp::MapClauseOwningOpInterface>(target)) {
llvm::SmallVector<mlir::Value> newMapOps;
- mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands();
+ mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars();
- for (size_t i = 0; i < mapOperandsArr.size(); ++i) {
- if (mapOperandsArr[i] == op) {
+ for (size_t i = 0; i < mapVarsArr.size(); ++i) {
+ if (mapVarsArr[i] == op) {
// Push new implicit maps generated for the descriptor.
newMapOps.push_back(baseAddr);
@@ -120,13 +120,13 @@ class OMPMapInfoFinalizationPass
// new additional map operand with an appropriate BlockArgument,
// as the printing and later processing currently requires a 1:1
// mapping of BlockArgs to MapInfoOp's at the same placement in
- // each array (BlockArgs and MapOperands).
+ // each array (BlockArgs and MapVars).
if (auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(target))
targetOp.getRegion().insertArgument(i, baseAddr.getType(), loc);
}
- newMapOps.push_back(mapOperandsArr[i]);
+ newMapOps.push_back(mapVarsArr[i]);
}
- mapClauseOwner.getMapOperandsMutable().assign(newMapOps);
+ mapClauseOwner.getMapVarsMutable().assign(newMapOps);
}
mlir::Value newDescParentMapOp = builder.create<mlir::omp::MapInfoOp>(
@@ -196,27 +196,27 @@ class OMPMapInfoFinalizationPass
return;
llvm::SmallVector<mlir::Value> newMapOps;
- mlir::OperandRange mapOperandsArr = mapClauseOwner.getMapOperands();
+ mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars();
auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(target);
- for (size_t i = 0; i < mapOperandsArr.size(); ++i) {
- if (mapOperandsArr[i] == op) {
+ for (size_t i = 0; i < mapVarsArr.size(); ++i) {
+ if (mapVarsArr[i] == op) {
for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) {
newMapOps.push_back(mapMember);
// for TargetOp's which have IsolatedFromAbove we must align the
// new additional map operand with an appropriate BlockArgument,
// as the printing and later processing currently requires a 1:1
// mapping of BlockArgs to MapInfoOp's at the same placement in
- // each array (BlockArgs and MapOperands).
+ // each array (BlockArgs and MapVars).
if (targetOp) {
targetOp.getRegion().insertArgument(i + j, mapMember.getType(),
targetOp->getLoc());
}
}
}
- newMapOps.push_back(mapOperandsArr[i]);
+ newMapOps.push_back(mapVarsArr[i]);
}
- mapClauseOwner.getMapOperandsMutable().assign(newMapOps);
+ mapClauseOwner.getMapVarsMutable().assign(newMapOps);
}
// This pass executes on omp::MapInfoOp's containing descriptor based types
diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp
index f703e09..5057e89 100644
--- a/flang/lib/Parser/executable-parsers.cpp
+++ b/flang/lib/Parser/executable-parsers.cpp
@@ -67,20 +67,19 @@ constexpr auto obsoleteExecutionPartConstruct{recovery(ignoredStatementPrefix >>
parenthesized(nonemptyList(Parser<AllocateShapeSpec>{}))))))};
TYPE_PARSER(recovery(
- withMessage("expected execution part construct"_err_en_US,
- CONTEXT_PARSER("execution part construct"_en_US,
- first(construct<ExecutionPartConstruct>(executableConstruct),
+ CONTEXT_PARSER("execution part construct"_en_US,
+ first(construct<ExecutionPartConstruct>(executableConstruct),
+ construct<ExecutionPartConstruct>(statement(indirect(formatStmt))),
+ construct<ExecutionPartConstruct>(statement(indirect(entryStmt))),
+ construct<ExecutionPartConstruct>(statement(indirect(dataStmt))),
+ extension<LanguageFeature::ExecutionPartNamelist>(
+ "nonstandard usage: NAMELIST in execution part"_port_en_US,
construct<ExecutionPartConstruct>(
- statement(indirect(formatStmt))),
- construct<ExecutionPartConstruct>(
- statement(indirect(entryStmt))),
- construct<ExecutionPartConstruct>(
- statement(indirect(dataStmt))),
- extension<LanguageFeature::ExecutionPartNamelist>(
- "nonstandard usage: NAMELIST in execution part"_port_en_US,
- construct<ExecutionPartConstruct>(
- statement(indirect(Parser<NamelistStmt>{})))),
- obsoleteExecutionPartConstruct))),
+ statement(indirect(Parser<NamelistStmt>{})))),
+ obsoleteExecutionPartConstruct,
+ lookAhead(declarationConstruct) >> SkipTo<'\n'>{} >>
+ fail<ExecutionPartConstruct>(
+ "misplaced declaration in the execution part"_err_en_US))),
construct<ExecutionPartConstruct>(executionPartErrorRecovery)))
// R509 execution-part -> executable-construct [execution-part-construct]...
diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index aa45548..c01d512b 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -111,9 +111,13 @@ void Prescanner::Statement() {
skipLeadingAmpersand_ |= !inFixedForm_;
return;
case LineClassification::Kind::PreprocessorDirective:
+ preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
+ afterPreprocessingDirective_ = true;
+ // Don't set skipLeadingAmpersand_
+ return;
case LineClassification::Kind::DefinitionDirective:
preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
- // Don't set afterPreprocessingDirective_
+ // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
return;
case LineClassification::Kind::CompilerDirective: {
directiveSentinel_ = line.sentinel;
diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp
index b51b601..c43696c 100644
--- a/flang/lib/Parser/program-parsers.cpp
+++ b/flang/lib/Parser/program-parsers.cpp
@@ -19,6 +19,31 @@
namespace Fortran::parser {
+// R1530 function-stmt ->
+// [prefix] FUNCTION function-name ( [dummy-arg-name-list] ) [suffix]
+// R1526 prefix -> prefix-spec [prefix-spec]...
+// R1531 dummy-arg-name -> name
+
+static constexpr auto validFunctionStmt{
+ construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name,
+ parenthesized(optionalList(name)), maybe(suffix)) /
+ atEndOfStmt ||
+ construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name / atEndOfStmt,
+ // PGI & Intel accept "FUNCTION F"
+ extension<LanguageFeature::OmitFunctionDummies>(
+ "nonstandard usage: FUNCTION statement without dummy argument list"_port_en_US,
+ pure<std::list<Name>>()),
+ pure<std::optional<Suffix>>())};
+
+// function-stmt with error recovery -- used in interfaces and internal
+// subprograms, but not at the top level, where REALFUNCTIONF and
+// INTEGERPUREELEMENTALFUNCTIONG(10) might appear as the first statement
+// of a main program.
+TYPE_PARSER(validFunctionStmt ||
+ construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name,
+ defaulted(parenthesized(optionalList(name))), maybe(suffix)) /
+ checkEndOfKnownStmt)
+
// R502 program-unit ->
// main-program | external-subprogram | module | submodule | block-data
// R503 external-subprogram -> function-subprogram | subroutine-subprogram
@@ -36,10 +61,11 @@ namespace Fortran::parser {
// Enforcing C1547 is done in semantics.
static constexpr auto programUnit{
construct<ProgramUnit>(indirect(Parser<Module>{})) ||
- construct<ProgramUnit>(indirect(functionSubprogram)) ||
construct<ProgramUnit>(indirect(subroutineSubprogram)) ||
construct<ProgramUnit>(indirect(Parser<Submodule>{})) ||
construct<ProgramUnit>(indirect(Parser<BlockData>{})) ||
+ lookAhead(validFunctionStmt) >>
+ construct<ProgramUnit>(indirect(functionSubprogram)) ||
construct<ProgramUnit>(indirect(Parser<MainProgram>{}))};
static constexpr auto normalProgramUnit{StartNewSubprogram{} >> programUnit /
skipMany(";"_tok) / space / recovery(endOfLine, SkipPast<'\n'>{})};
@@ -66,16 +92,6 @@ TYPE_PARSER(
normalProgramUnit) /
skipStuffBeforeStatement))
-// R504 specification-part ->
-// [use-stmt]... [import-stmt]... [implicit-part]
-// [declaration-construct]...
-TYPE_CONTEXT_PARSER("specification part"_en_US,
- construct<SpecificationPart>(many(openaccDeclarativeConstruct),
- many(openmpDeclarativeConstruct), many(indirect(compilerDirective)),
- many(statement(indirect(Parser<UseStmt>{}))),
- many(unambiguousStatement(indirect(Parser<ImportStmt>{}))),
- implicitPart, many(declarationConstruct)))
-
// R507 declaration-construct ->
// specification-construct | data-stmt | format-stmt |
// entry-stmt | stmt-function-stmt
@@ -106,18 +122,29 @@ constexpr auto misplacedSpecificationStmt{Parser<UseStmt>{} >>
fail<DeclarationConstruct>(
"IMPLICIT statements must follow USE and IMPORT and precede all other declarations"_err_en_US)};
-TYPE_PARSER(recovery(
- withMessage("expected declaration construct"_err_en_US,
- CONTEXT_PARSER("declaration construct"_en_US,
- first(construct<DeclarationConstruct>(specificationConstruct),
- construct<DeclarationConstruct>(statement(indirect(dataStmt))),
- construct<DeclarationConstruct>(
- statement(indirect(formatStmt))),
- construct<DeclarationConstruct>(statement(indirect(entryStmt))),
- construct<DeclarationConstruct>(
- statement(indirect(Parser<StmtFunctionStmt>{}))),
- misplacedSpecificationStmt))),
- construct<DeclarationConstruct>(declErrorRecovery)))
+TYPE_CONTEXT_PARSER("declaration construct"_en_US,
+ first(construct<DeclarationConstruct>(specificationConstruct),
+ construct<DeclarationConstruct>(statement(indirect(dataStmt))),
+ construct<DeclarationConstruct>(statement(indirect(formatStmt))),
+ construct<DeclarationConstruct>(statement(indirect(entryStmt))),
+ construct<DeclarationConstruct>(
+ statement(indirect(Parser<StmtFunctionStmt>{}))),
+ misplacedSpecificationStmt))
+
+constexpr auto recoveredDeclarationConstruct{
+ recovery(withMessage("expected declaration construct"_err_en_US,
+ declarationConstruct),
+ construct<DeclarationConstruct>(declErrorRecovery))};
+
+// R504 specification-part ->
+// [use-stmt]... [import-stmt]... [implicit-part]
+// [declaration-construct]...
+TYPE_CONTEXT_PARSER("specification part"_en_US,
+ construct<SpecificationPart>(many(openaccDeclarativeConstruct),
+ many(openmpDeclarativeConstruct), many(indirect(compilerDirective)),
+ many(statement(indirect(Parser<UseStmt>{}))),
+ many(unambiguousStatement(indirect(Parser<ImportStmt>{}))),
+ implicitPart, many(recoveredDeclarationConstruct)))
// R507 variant of declaration-construct for use in limitedSpecificationPart.
constexpr auto invalidDeclarationStmt{formatStmt >>
@@ -528,20 +555,6 @@ TYPE_CONTEXT_PARSER("FUNCTION subprogram"_en_US,
executionPart, maybe(internalSubprogramPart),
unterminatedStatement(endFunctionStmt)))
-// R1530 function-stmt ->
-// [prefix] FUNCTION function-name ( [dummy-arg-name-list] ) [suffix]
-// R1526 prefix -> prefix-spec [prefix-spec]...
-// R1531 dummy-arg-name -> name
-TYPE_CONTEXT_PARSER("FUNCTION statement"_en_US,
- construct<FunctionStmt>(many(prefixSpec), "FUNCTION" >> name,
- parenthesized(optionalList(name)), maybe(suffix)) ||
- extension<LanguageFeature::OmitFunctionDummies>(
- "nonstandard usage: FUNCTION statement without dummy argument list"_port_en_US,
- construct<FunctionStmt>( // PGI & Intel accept "FUNCTION F"
- many(prefixSpec), "FUNCTION" >> name,
- construct<std::list<Name>>(),
- construct<std::optional<Suffix>>())))
-
// R1532 suffix ->
// proc-language-binding-spec [RESULT ( result-name )] |
// RESULT ( result-name ) [proc-language-binding-spec]
@@ -566,11 +579,13 @@ TYPE_CONTEXT_PARSER("SUBROUTINE subprogram"_en_US,
// [prefix] SUBROUTINE subroutine-name [( [dummy-arg-list] )
// [proc-language-binding-spec]]
TYPE_PARSER(
- construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
- parenthesized(optionalList(dummyArg)), maybe(languageBindingSpec)) ||
- construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
- pure<std::list<DummyArg>>(),
- pure<std::optional<LanguageBindingSpec>>()))
+ (construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
+ !"("_tok >> pure<std::list<DummyArg>>(),
+ pure<std::optional<LanguageBindingSpec>>()) ||
+ construct<SubroutineStmt>(many(prefixSpec), "SUBROUTINE" >> name,
+ defaulted(parenthesized(optionalList(dummyArg))),
+ maybe(languageBindingSpec))) /
+ checkEndOfKnownStmt)
// R1536 dummy-arg -> dummy-arg-name | *
TYPE_PARSER(construct<DummyArg>(name) || construct<DummyArg>(star))
diff --git a/flang/lib/Parser/stmt-parser.h b/flang/lib/Parser/stmt-parser.h
index ba647fd..00bae2b 100644
--- a/flang/lib/Parser/stmt-parser.h
+++ b/flang/lib/Parser/stmt-parser.h
@@ -30,6 +30,10 @@ inline constexpr auto unterminatedStatement(const PA &p) {
maybe(label), space >> p));
}
+constexpr auto atEndOfStmt{space >>
+ withMessage("expected end of statement"_err_en_US, lookAhead(";\n"_ch))};
+constexpr auto checkEndOfKnownStmt{recovery(atEndOfStmt, SkipTo<'\n'>{})};
+
constexpr auto endOfLine{
"\n"_ch >> ok || fail("expected end of line"_err_en_US)};
@@ -86,8 +90,6 @@ constexpr auto executionPartErrorRecovery{stmtErrorRecoveryStart >>
// END statement error recovery
constexpr auto missingOptionalName{pure<std::optional<Name>>()};
constexpr auto noNameEnd{"END" >> missingOptionalName};
-constexpr auto atEndOfStmt{space >>
- withMessage("expected end of statement"_err_en_US, lookAhead(";\n"_ch))};
constexpr auto bareEnd{noNameEnd / recovery(atEndOfStmt, SkipTo<'\n'>{})};
// For unrecognizable construct END statements. Be sure to not consume
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
index e344390..8f7a200 100644
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -600,10 +600,13 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
const Scope &subpScope{
GetProgramUnitContaining(context.FindScope(name_.source))};
if (allocateObject_.typedExpr && allocateObject_.typedExpr->v) {
- if (auto whyNot{WhyNotDefinable(name_.source, subpScope,
- {DefinabilityFlag::PointerDefinition,
- DefinabilityFlag::AcceptAllocatable},
- *allocateObject_.typedExpr->v)}) {
+ DefinabilityFlags flags{DefinabilityFlag::PointerDefinition,
+ DefinabilityFlag::AcceptAllocatable};
+ if (allocateInfo_.gotSource) {
+ flags.set(DefinabilityFlag::SourcedAllocation);
+ }
+ if (auto whyNot{WhyNotDefinable(
+ name_.source, subpScope, flags, *allocateObject_.typedExpr->v)}) {
context
.Say(name_.source,
"Name in ALLOCATE statement is not definable"_err_en_US)
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index ef51b9a..9fad1aa 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -27,7 +27,7 @@ namespace characteristics = Fortran::evaluate::characteristics;
namespace Fortran::semantics {
static void CheckImplicitInterfaceArg(evaluate::ActualArgument &arg,
- parser::ContextualMessages &messages, evaluate::FoldingContext &context) {
+ parser::ContextualMessages &messages, SemanticsContext &context) {
auto restorer{
messages.SetLocation(arg.sourceLocation().value_or(messages.at()))};
if (auto kw{arg.keyword()}) {
@@ -79,8 +79,12 @@ static void CheckImplicitInterfaceArg(evaluate::ActualArgument &arg,
messages.Say(
"VOLATILE argument requires an explicit interface"_err_en_US);
}
+ if (const Symbol & base{named->GetFirstSymbol()};
+ IsFunctionResult(base)) {
+ context.NoteDefinedSymbol(base);
+ }
} else if (auto argChars{characteristics::DummyArgument::FromActual(
- "actual argument", *expr, context,
+ "actual argument", *expr, context.foldingContext(),
/*forImplicitInterface=*/true)}) {
const auto *argProcDesignator{
std::get_if<evaluate::ProcedureDesignator>(&expr->u)};
@@ -647,8 +651,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
actualLastSymbol->name(), dummyName);
}
- // Definability
- bool actualIsVariable{evaluate::IsVariable(actual)};
+ // Definability checking
+ // Problems with polymorphism are caught in the callee's definition.
if (scope) {
std::optional<parser::MessageFixedText> undefinableMessage;
if (dummy.intent == common::Intent::Out) {
@@ -670,7 +674,6 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
}
}
if (undefinableMessage) {
- // Problems with polymorphism are caught in the callee's definition.
DefinabilityFlags flags{DefinabilityFlag::PolymorphicOkInPure};
if (isElemental) { // 15.5.2.4(21)
flags.set(DefinabilityFlag::VectorSubscriptIsOk);
@@ -689,6 +692,14 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
messages.Say(std::move(*whyNot));
}
}
+ } else if (dummy.intent != common::Intent::In ||
+ (dummyIsPointer && !actualIsPointer)) {
+ if (auto named{evaluate::ExtractNamedEntity(actual)}) {
+ if (const Symbol & base{named->GetFirstSymbol()};
+ IsFunctionResult(base)) {
+ context.NoteDefinedSymbol(base);
+ }
+ }
}
}
@@ -893,6 +904,7 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
// argument
if (dummy.attrs.test(characteristics::DummyDataObject::Attr::Target) &&
context.ShouldWarn(common::UsageWarning::NonTargetPassedToTarget)) {
+ bool actualIsVariable{evaluate::IsVariable(actual)};
bool actualIsTemp{!actualIsVariable || HasVectorSubscript(actual) ||
evaluate::ExtractCoarrayRef(actual)};
if (actualIsTemp) {
@@ -1416,7 +1428,8 @@ static void CheckAssociated(evaluate::ActualArguments &arguments,
if (auto whyNot{WhyNotDefinable(
pointerArg->sourceLocation().value_or(messages.at()),
*scope,
- DefinabilityFlags{DefinabilityFlag::PointerDefinition},
+ DefinabilityFlags{DefinabilityFlag::PointerDefinition,
+ DefinabilityFlag::DoNotNoteDefinition},
*pointerExpr)}) {
if (whyNot->IsFatal()) {
if (auto *msg{messages.Say(pointerArg->sourceLocation(),
@@ -2021,7 +2034,7 @@ bool CheckArguments(const characteristics::Procedure &proc,
auto restorer{messages.SetMessages(buffer)};
for (auto &actual : actuals) {
if (actual) {
- CheckImplicitInterfaceArg(*actual, messages, foldingContext);
+ CheckImplicitInterfaceArg(*actual, messages, context);
}
}
}
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 5b3ea21..60b8b32 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -307,6 +307,25 @@ private:
WarnOnIoStmt(source);
}
}
+ template <typename A>
+ void ErrorIfHostSymbol(const A &expr, const parser::CharBlock &source) {
+ for (const Symbol &sym : CollectCudaSymbols(expr)) {
+ if (const auto *details =
+ sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+ if (details->IsArray() &&
+ (!details->cudaDataAttr() ||
+ (details->cudaDataAttr() &&
+ *details->cudaDataAttr() != common::CUDADataAttr::Device &&
+ *details->cudaDataAttr() != common::CUDADataAttr::Managed &&
+ *details->cudaDataAttr() !=
+ common::CUDADataAttr::Unified))) {
+ context_.Say(source,
+ "Host array '%s' cannot be present in CUF kernel"_err_en_US,
+ sym.name());
+ }
+ }
+ }
+ }
void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
common::visit(
common::visitors{
@@ -349,6 +368,19 @@ private:
[&](const common::Indirection<parser::IfStmt> &x) {
Check(x.value());
},
+ [&](const common::Indirection<parser::AssignmentStmt> &x) {
+ if (IsCUFKernelDo) {
+ const evaluate::Assignment *assign{
+ semantics::GetAssignment(x.value())};
+ if (assign) {
+ ErrorIfHostSymbol(assign->lhs, source);
+ ErrorIfHostSymbol(assign->rhs, source);
+ }
+ }
+ if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+ context_.Say(source, std::move(*msg));
+ }
+ },
[&](const auto &x) {
if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
context_.Say(source, std::move(*msg));
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index eb898ba..a52f013 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2919,7 +2919,7 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
if (derived->sequence()) { // C1801
msgs.Say(symbol.name(),
"An interoperable derived type cannot have the SEQUENCE attribute"_err_en_US);
- } else if (!derived->paramDecls().empty()) { // C1802
+ } else if (!derived->paramNameOrder().empty()) { // C1802
msgs.Say(symbol.name(),
"An interoperable derived type cannot have a type parameter"_err_en_US);
} else if (const auto *parent{
diff --git a/flang/lib/Semantics/check-purity.cpp b/flang/lib/Semantics/check-purity.cpp
index 55a9a2f..1046f36 100644
--- a/flang/lib/Semantics/check-purity.cpp
+++ b/flang/lib/Semantics/check-purity.cpp
@@ -31,7 +31,7 @@ void PurityChecker::Enter(const parser::FunctionSubprogram &func) {
stmt.source, std::get<std::list<parser::PrefixSpec>>(stmt.statement.t));
}
-void PurityChecker::Leave(const parser::FunctionSubprogram &) { Left(); }
+void PurityChecker::Leave(const parser::FunctionSubprogram &func) { Left(); }
bool PurityChecker::InPureSubprogram() const {
return pureDepth_ >= 0 && depth_ >= pureDepth_;
diff --git a/flang/lib/Semantics/definable.cpp b/flang/lib/Semantics/definable.cpp
index d594b1e..ae76f66 100644
--- a/flang/lib/Semantics/definable.cpp
+++ b/flang/lib/Semantics/definable.cpp
@@ -127,6 +127,12 @@ static std::optional<parser::Message> WhyNotDefinableBase(parser::CharBlock at,
(!IsPointer(ultimate) || (isWholeSymbol && isPointerDefinition))) {
return BlameSymbol(
at, "'%s' is an INTENT(IN) dummy argument"_en_US, original);
+ } else if (acceptAllocatable &&
+ !flags.test(DefinabilityFlag::SourcedAllocation)) {
+ // allocating a function result doesn't count as a def'n
+ // unless there's SOURCE=
+ } else if (!flags.test(DefinabilityFlag::DoNotNoteDefinition)) {
+ scope.context().NoteDefinedSymbol(ultimate);
}
if (const Scope * pure{FindPureProcedureContaining(scope)}) {
// Additional checking for pure subprograms.
diff --git a/flang/lib/Semantics/definable.h b/flang/lib/Semantics/definable.h
index b14c644..709bbba 100644
--- a/flang/lib/Semantics/definable.h
+++ b/flang/lib/Semantics/definable.h
@@ -30,7 +30,9 @@ ENUM_CLASS(DefinabilityFlag,
DuplicatesAreOk, // vector subscript may have duplicates
PointerDefinition, // a pointer is being defined, not its target
AcceptAllocatable, // treat allocatable as if it were a pointer
- PolymorphicOkInPure) // don't check for polymorphic type in pure subprogram
+ SourcedAllocation, // ALLOCATE(a,SOURCE=)
+ PolymorphicOkInPure, // don't check for polymorphic type in pure subprogram
+ DoNotNoteDefinition) // context does not imply definition
using DefinabilityFlags =
common::EnumSet<DefinabilityFlag, DefinabilityFlag_enumSize>;
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index a1c4c03..c7e7716 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -410,15 +410,14 @@ bool ModFileWriter::PutComponents(const Symbol &typeSymbol) {
llvm::raw_string_ostream typeBindings{buf};
UnorderedSymbolSet emitted;
SymbolVector symbols{scope.GetSymbols()};
- // Emit type parameters first
- for (const Symbol &symbol : symbols) {
- if (symbol.has<TypeParamDetails>()) {
- PutSymbol(typeBindings, symbol);
- emitted.emplace(symbol);
- }
- }
- // Emit components in component order.
+ // Emit type parameter declarations first, in order
const auto &details{typeSymbol.get<DerivedTypeDetails>()};
+ for (const Symbol &symbol : details.paramDeclOrder()) {
+ CHECK(symbol.has<TypeParamDetails>());
+ PutSymbol(typeBindings, symbol);
+ emitted.emplace(symbol);
+ }
+ // Emit actual components in component order.
for (SourceName name : details.componentNames()) {
auto iter{scope.find(name)};
if (iter != scope.end()) {
@@ -549,10 +548,10 @@ void ModFileWriter::PutDerivedType(
decls_ << ",extends(" << extends->name() << ')';
}
decls_ << "::" << typeSymbol.name();
- if (!details.paramNames().empty()) {
+ if (!details.paramNameOrder().empty()) {
char sep{'('};
- for (const auto &name : details.paramNames()) {
- decls_ << sep << name;
+ for (const SymbolRef &ref : details.paramNameOrder()) {
+ decls_ << sep << ref->name();
sep = ',';
}
decls_ << ')';
@@ -1046,7 +1045,7 @@ void ModFileWriter::PutTypeParam(llvm::raw_ostream &os, const Symbol &symbol) {
os, symbol,
[&]() {
PutType(os, DEREF(symbol.GetType()));
- PutLower(os << ',', common::EnumToString(details.attr()));
+ PutLower(os << ',', common::EnumToString(details.attr().value()));
},
symbol.attrs());
PutInit(os, details.init());
diff --git a/flang/lib/Semantics/pointer-assignment.cpp b/flang/lib/Semantics/pointer-assignment.cpp
index dae3b1a..4948fce 100644
--- a/flang/lib/Semantics/pointer-assignment.cpp
+++ b/flang/lib/Semantics/pointer-assignment.cpp
@@ -358,8 +358,10 @@ bool PointerAssignmentChecker::Check(const evaluate::Designator<T> &d) {
Say(std::get<MessageFormattedText>(*msg));
}
return false;
+ } else {
+ context_.NoteDefinedSymbol(*base);
+ return true;
}
- return true;
}
// Common handling for procedure pointer right-hand sides
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index fb32ce6..d635a7b 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -19,6 +19,7 @@
#include "flang/Parser/parse-tree.h"
#include "flang/Parser/tools.h"
#include "flang/Semantics/expression.h"
+#include "flang/Semantics/tools.h"
#include <list>
#include <map>
#include <sstream>
@@ -729,7 +730,6 @@ private:
void CheckNameInAllocateStmt(const parser::CharBlock &source,
const parser::Name &ompObject, const parser::AllocateStmt &allocate);
- bool HasSymbolInEnclosingScope(const Symbol &, Scope &);
std::int64_t ordCollapseLevel{0};
void AddOmpRequiresToScope(Scope &, WithOmpDeclarative::RequiresFlags,
@@ -2035,6 +2035,14 @@ void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) {
// and adjust the symbol for each Name if necessary
void OmpAttributeVisitor::Post(const parser::Name &name) {
auto *symbol{name.symbol};
+ auto IsPrivatizable = [](const Symbol *sym) {
+ return !IsProcedure(*sym) && !IsNamedConstant(*sym) &&
+ !sym->owner().IsDerivedType() &&
+ sym->owner().kind() != Scope::Kind::ImpliedDos &&
+ !sym->detailsIf<semantics::AssocEntityDetails>() &&
+ !sym->detailsIf<semantics::NamelistDetails>();
+ };
+
if (symbol && !dirContext_.empty() && GetContext().withinConstruct) {
// Exclude construct-names
if (auto *details{symbol->detailsIf<semantics::MiscDetails>()}) {
@@ -2042,8 +2050,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
return;
}
}
- if (!symbol->owner().IsDerivedType() && !IsProcedure(*symbol) &&
- !IsObjectWithDSA(*symbol) && !IsNamedConstant(*symbol)) {
+ if (IsPrivatizable(symbol) && !IsObjectWithDSA(*symbol)) {
// TODO: create a separate function to go through the rules for
// predetermined, explicitly determined, and implicitly
// determined data-sharing attributes (2.15.1.1).
@@ -2068,6 +2075,9 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
if (found->test(semantics::Symbol::Flag::OmpThreadprivate))
return;
}
+ if (!IsPrivatizable(symbol)) {
+ return;
+ }
// Implicitly determined DSAs
// OMP 5.2 5.1.1 - Variables Referenced in a Construct
@@ -2085,16 +2095,22 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
}
}
- // When handling each implicit rule, either a new private symbol is
- // declared or the last declared symbol is used.
- // In the latter case, it's necessary to insert a new symbol in the scope
- // being processed, associated with the last declared symbol.
- // This captures the fact that, although we are using the last declared
- // symbol, its DSA could be different in this scope.
- // Also, because of how symbols are collected in lowering, not inserting
- // a new symbol in this scope could lead to the conclusion that the
- // symbol was declared in this construct, which would result in wrong
- // privatization code being generated.
+ // When handling each implicit rule for a given symbol, one of the
+ // following 3 actions may be taken:
+ // 1. Declare a new private symbol.
+ // 2. Create a new association symbol with no flags, that will represent
+ // a shared symbol in the current scope. Note that symbols without
+ // any private flags are considered as shared.
+ // 3. Use the last declared private symbol, by inserting a new symbol
+ // in the scope being processed, associated with it.
+ // If no private symbol was declared previously, then no association
+ // is needed and the symbol from the enclosing scope will be
+ // inherited by the current one.
+ //
+ // Because of how symbols are collected in lowering, not inserting a new
+ // symbol in the last case could lead to the conclusion that a symbol
+ // from an enclosing construct was declared in the current construct,
+ // which would result in wrong privatization code being generated.
// Consider the following example:
//
// !$omp parallel default(private) ! p1
@@ -2107,48 +2123,56 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
// (p2), it would use the x symbol definition from the enclosing scope.
// Then, when p2's default symbols were collected in lowering, the x
// symbol from the outer parallel construct (p1) would be collected, as
- // it would have the private flag set (note that symbols that don't have
- // any private flag are considered as shared).
+ // it would have the private flag set.
// This would make x appear to be defined in p2, causing it to be
// privatized in p2 and its privatization in p1 to be skipped.
- auto declNewSymbol = [&](Symbol::Flag flag) {
+ auto makePrivateSymbol = [&](Symbol::Flag flag) {
Symbol *hostSymbol =
lastDeclSymbol ? lastDeclSymbol : &symbol->GetUltimate();
lastDeclSymbol = DeclarePrivateAccessEntity(
*hostSymbol, flag, context_.FindScope(dirContext.directiveSource));
return lastDeclSymbol;
};
+ auto makeSharedSymbol = [&]() {
+ Symbol *hostSymbol =
+ lastDeclSymbol ? lastDeclSymbol : &symbol->GetUltimate();
+ MakeAssocSymbol(symbol->name(), *hostSymbol,
+ context_.FindScope(dirContext.directiveSource));
+ };
auto useLastDeclSymbol = [&]() {
if (lastDeclSymbol)
MakeAssocSymbol(symbol->name(), *lastDeclSymbol,
context_.FindScope(dirContext.directiveSource));
};
+ bool taskGenDir = llvm::omp::taskGeneratingSet.test(dirContext.directive);
+ bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
+ bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
+ bool teamsDir = llvm::omp::allTeamsSet.test(dirContext.directive);
+
if (dsa.has_value()) {
- useLastDeclSymbol();
+ if (dsa.value() == Symbol::Flag::OmpShared &&
+ (parallelDir || taskGenDir || teamsDir))
+ makeSharedSymbol();
+ // Private symbols will have been declared already.
prevDSA = dsa;
continue;
}
- bool taskGenDir = llvm::omp::taskGeneratingSet.test(dirContext.directive);
- bool targetDir = llvm::omp::allTargetSet.test(dirContext.directive);
- bool parallelDir = llvm::omp::allParallelSet.test(dirContext.directive);
-
if (dirContext.defaultDSA == Symbol::Flag::OmpPrivate ||
dirContext.defaultDSA == Symbol::Flag::OmpFirstPrivate ||
dirContext.defaultDSA == Symbol::Flag::OmpShared) {
// 1) default
// Allowed only with parallel, teams and task generating constructs.
- assert(parallelDir || taskGenDir ||
- llvm::omp::allTeamsSet.test(dirContext.directive));
+ assert(parallelDir || taskGenDir || teamsDir);
if (dirContext.defaultDSA != Symbol::Flag::OmpShared)
- declNewSymbol(dirContext.defaultDSA);
+ makePrivateSymbol(dirContext.defaultDSA);
else
- useLastDeclSymbol();
+ makeSharedSymbol();
dsa = dirContext.defaultDSA;
} else if (parallelDir) {
// 2) parallel -> shared
- useLastDeclSymbol();
+ makeSharedSymbol();
dsa = Symbol::Flag::OmpShared;
} else if (!taskGenDir && !targetDir) {
// 3) enclosing context
@@ -2161,12 +2185,12 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
// TODO 5) dummy arg in orphaned taskgen construct -> firstprivate
if (prevDSA == Symbol::Flag::OmpShared) {
// 6) shared in enclosing context -> shared
- useLastDeclSymbol();
+ makeSharedSymbol();
dsa = Symbol::Flag::OmpShared;
} else {
// 7) firstprivate
dsa = Symbol::Flag::OmpFirstPrivate;
- declNewSymbol(*dsa)->set(Symbol::Flag::OmpImplicit);
+ makePrivateSymbol(*dsa)->set(Symbol::Flag::OmpImplicit);
}
}
prevDSA = dsa;
@@ -2570,20 +2594,59 @@ void ResolveOmpTopLevelParts(
});
}
-void OmpAttributeVisitor::CheckDataCopyingClause(
- const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) {
- const auto *checkSymbol{&symbol};
+static bool IsSymbolInCommonBlock(const Symbol &symbol) {
+ // TODO Improve the performance of this predicate function.
+ // Going through all symbols sequentially, in all common blocks, can be
+ // slow when there are many symbols. A possible optimization is to add
+ // an OmpInCommonBlock flag to Symbol, to make it possible to quickly
+ // test if a given symbol is in a common block.
+ for (const auto &cb : symbol.owner().commonBlocks()) {
+ if (IsCommonBlockContaining(cb.second.get(), symbol)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool IsSymbolThreadprivate(const Symbol &symbol) {
if (const auto *details{symbol.detailsIf<HostAssocDetails>()}) {
- checkSymbol = &details->symbol();
+ return details->symbol().test(Symbol::Flag::OmpThreadprivate);
}
+ return symbol.test(Symbol::Flag::OmpThreadprivate);
+}
+static bool IsSymbolPrivate(const Symbol &symbol) {
+ if (symbol.test(Symbol::Flag::OmpPrivate) ||
+ symbol.test(Symbol::Flag::OmpFirstPrivate)) {
+ return true;
+ }
+ // A symbol that has not gone through constructs that may privatize the
+ // original symbol may be predetermined as private.
+ // (OMP 5.2 5.1.1 - Variables Referenced in a Construct)
+ if (symbol == symbol.GetUltimate()) {
+ switch (symbol.owner().kind()) {
+ case Scope::Kind::MainProgram:
+ case Scope::Kind::Subprogram:
+ case Scope::Kind::BlockConstruct:
+ return !symbol.attrs().test(Attr::SAVE) &&
+ !symbol.attrs().test(Attr::PARAMETER) && !IsAssumedShape(symbol) &&
+ !IsSymbolInCommonBlock(symbol);
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+void OmpAttributeVisitor::CheckDataCopyingClause(
+ const parser::Name &name, const Symbol &symbol, Symbol::Flag ompFlag) {
if (ompFlag == Symbol::Flag::OmpCopyIn) {
// List of items/objects that can appear in a 'copyin' clause must be
// 'threadprivate'
- if (!checkSymbol->test(Symbol::Flag::OmpThreadprivate)) {
+ if (!IsSymbolThreadprivate(symbol)) {
context_.Say(name.source,
"Non-THREADPRIVATE object '%s' in COPYIN clause"_err_en_US,
- checkSymbol->name());
+ symbol.name());
}
} else if (ompFlag == Symbol::Flag::OmpCopyPrivate &&
GetContext().directive == llvm::omp::Directive::OMPD_single) {
@@ -2596,18 +2659,13 @@ void OmpAttributeVisitor::CheckDataCopyingClause(
"COPYPRIVATE variable '%s' may not appear on a PRIVATE or "
"FIRSTPRIVATE clause on a SINGLE construct"_err_en_US,
symbol.name());
- } else {
+ } else if (!IsSymbolThreadprivate(symbol) && !IsSymbolPrivate(symbol)) {
// List of items/objects that can appear in a 'copyprivate' clause must be
// either 'private' or 'threadprivate' in enclosing context.
- if (!checkSymbol->test(Symbol::Flag::OmpThreadprivate) &&
- !(HasSymbolInEnclosingScope(symbol, currScope()) &&
- (symbol.test(Symbol::Flag::OmpPrivate) ||
- symbol.test(Symbol::Flag::OmpFirstPrivate)))) {
- context_.Say(name.source,
- "COPYPRIVATE variable '%s' is not PRIVATE or THREADPRIVATE in "
- "outer context"_err_en_US,
- symbol.name());
- }
+ context_.Say(name.source,
+ "COPYPRIVATE variable '%s' is not PRIVATE or THREADPRIVATE in "
+ "outer context"_err_en_US,
+ symbol.name());
}
}
}
@@ -2677,12 +2735,6 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
}
}
-bool OmpAttributeVisitor::HasSymbolInEnclosingScope(
- const Symbol &symbol, Scope &scope) {
- const auto symbols{scope.parent().GetSymbols()};
- return llvm::is_contained(symbols, symbol);
-}
-
// Goes through the names in an OmpObjectList and checks if each name appears
// in the given allocate statement
void OmpAttributeVisitor::CheckAllNamesInAllocateStmt(
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index f761355..b7725c5 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -5456,34 +5456,14 @@ bool DeclarationVisitor::Pre(const parser::DerivedTypeDef &x) {
CHECK(scope.symbol());
CHECK(scope.symbol()->scope() == &scope);
auto &details{scope.symbol()->get<DerivedTypeDetails>()};
- std::set<SourceName> paramNames;
for (auto &paramName : std::get<std::list<parser::Name>>(stmt.statement.t)) {
- details.add_paramName(paramName.source);
- auto *symbol{FindInScope(scope, paramName)};
- if (!symbol) {
- Say(paramName,
- "No definition found for type parameter '%s'"_err_en_US); // C742
- // No symbol for a type param. Create one and mark it as containing an
- // error to improve subsequent semantic processing
- BeginAttrs();
- Symbol *typeParam{MakeTypeSymbol(
- paramName, TypeParamDetails{common::TypeParamAttr::Len})};
- context().SetError(*typeParam);
- EndAttrs();
- } else if (!symbol->has<TypeParamDetails>()) {
- Say2(paramName, "'%s' is not defined as a type parameter"_err_en_US,
- *symbol, "Definition of '%s'"_en_US); // C741
- }
- if (!paramNames.insert(paramName.source).second) {
- Say(paramName,
- "Duplicate type parameter name: '%s'"_err_en_US); // C731
- }
- }
- for (const auto &[name, symbol] : currScope()) {
- if (symbol->has<TypeParamDetails>() && !paramNames.count(name)) {
- SayDerivedType(name,
- "'%s' is not a type parameter of this derived type"_err_en_US,
- currScope()); // C741
+ if (auto *symbol{FindInScope(scope, paramName)}) {
+ if (auto *details{symbol->detailsIf<TypeParamDetails>()}) {
+ if (!details->attr()) {
+ Say(paramName,
+ "No definition found for type parameter '%s'"_err_en_US); // C742
+ }
+ }
}
}
Walk(std::get<std::list<parser::Statement<parser::PrivateOrSequence>>>(x.t));
@@ -5499,7 +5479,7 @@ bool DeclarationVisitor::Pre(const parser::DerivedTypeDef &x) {
"A sequence type should have at least one component"_warn_en_US);
}
}
- if (!details.paramNames().empty()) { // C740
+ if (!details.paramDeclOrder().empty()) { // C740
Say(stmt.source,
"A sequence type may not have type parameters"_err_en_US);
}
@@ -5559,24 +5539,50 @@ void DeclarationVisitor::Post(const parser::DerivedTypeStmt &x) {
details.add_component(comp);
}
}
+ // Create symbols now for type parameters so that they shadow names
+ // from the enclosing specification part.
+ if (auto *details{symbol.detailsIf<DerivedTypeDetails>()}) {
+ for (const auto &name : std::get<std::list<parser::Name>>(x.t)) {
+ if (Symbol * symbol{MakeTypeSymbol(name, TypeParamDetails{})}) {
+ details->add_paramNameOrder(*symbol);
+ }
+ }
+ }
EndAttrs();
}
void DeclarationVisitor::Post(const parser::TypeParamDefStmt &x) {
auto *type{GetDeclTypeSpec()};
+ DerivedTypeDetails *derivedDetails{nullptr};
+ if (Symbol * dtSym{currScope().symbol()}) {
+ derivedDetails = dtSym->detailsIf<DerivedTypeDetails>();
+ }
auto attr{std::get<common::TypeParamAttr>(x.t)};
for (auto &decl : std::get<std::list<parser::TypeParamDecl>>(x.t)) {
auto &name{std::get<parser::Name>(decl.t)};
- if (Symbol * symbol{MakeTypeSymbol(name, TypeParamDetails{attr})}) {
- SetType(name, *type);
- if (auto &init{
- std::get<std::optional<parser::ScalarIntConstantExpr>>(decl.t)}) {
- if (auto maybeExpr{AnalyzeExpr(context(), *init)}) {
- if (auto *intExpr{std::get_if<SomeIntExpr>(&maybeExpr->u)}) {
- symbol->get<TypeParamDetails>().set_init(std::move(*intExpr));
+ if (Symbol * symbol{FindInScope(currScope(), name)}) {
+ if (auto *paramDetails{symbol->detailsIf<TypeParamDetails>()}) {
+ if (!paramDetails->attr()) {
+ paramDetails->set_attr(attr);
+ SetType(name, *type);
+ if (auto &init{std::get<std::optional<parser::ScalarIntConstantExpr>>(
+ decl.t)}) {
+ if (auto maybeExpr{AnalyzeExpr(context(), *init)}) {
+ if (auto *intExpr{std::get_if<SomeIntExpr>(&maybeExpr->u)}) {
+ paramDetails->set_init(std::move(*intExpr));
+ }
+ }
}
+ if (derivedDetails) {
+ derivedDetails->add_paramDeclOrder(*symbol);
+ }
+ } else {
+ Say(name,
+ "Type parameter '%s' was already declared in this derived type"_err_en_US);
}
}
+ } else {
+ Say(name, "'%s' is not a parameter of this derived type"_err_en_US);
}
}
EndDecl();
@@ -6779,9 +6785,6 @@ Symbol *DeclarationVisitor::MakeTypeSymbol(
}
Symbol &result{MakeSymbol(name, attrs, std::move(details))};
SetCUDADataAttr(name, result, cudaDataAttr());
- if (result.has<TypeParamDetails>()) {
- derivedType.symbol()->get<DerivedTypeDetails>().add_paramDecl(result);
- }
return &result;
}
}
@@ -7118,9 +7121,13 @@ void ConstructVisitor::Post(const parser::AssociateStmt &x) {
for (auto nthLastAssoc{assocCount}; nthLastAssoc > 0; --nthLastAssoc) {
SetCurrentAssociation(nthLastAssoc);
if (auto *symbol{MakeAssocEntity()}) {
- if (ExtractCoarrayRef(GetCurrentAssociation().selector.expr)) { // C1103
+ const MaybeExpr &expr{GetCurrentAssociation().selector.expr};
+ if (ExtractCoarrayRef(expr)) { // C1103
Say("Selector must not be a coindexed object"_err_en_US);
}
+ if (evaluate::IsAssumedRank(expr)) {
+ Say("Selector must not be assumed-rank"_err_en_US);
+ }
SetTypeFromAssociation(*symbol);
SetAttrsFromAssociation(*symbol);
}
@@ -7830,6 +7837,12 @@ const parser::Name *DeclarationVisitor::ResolveName(const parser::Name &name) {
CheckEntryDummyUse(name.source, symbol);
ConvertToObjectEntity(*symbol);
ApplyImplicitRules(*symbol);
+ } else if (const auto *tpd{symbol->detailsIf<TypeParamDetails>()};
+ tpd && !tpd->attr()) {
+ Say(name,
+ "Type parameter '%s' was referenced before being declared"_err_en_US,
+ name.source);
+ context().SetError(*symbol);
}
if (checkIndexUseInOwnBounds_ &&
*checkIndexUseInOwnBounds_ == name.source && !InModuleFile()) {
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 8939dc4..6690924 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -42,7 +42,8 @@ static int FindLenParameterIndex(
if (&*ref == &symbol) {
return lenIndex;
}
- if (ref->get<TypeParamDetails>().attr() == common::TypeParamAttr::Len) {
+ if (auto attr{ref->get<TypeParamDetails>().attr()};
+ attr && *attr == common::TypeParamAttr::Len) {
++lenIndex;
}
}
@@ -371,7 +372,7 @@ static std::optional<std::string> GetSuffixIfTypeKindParameters(
std::optional<std::string> suffix;
for (SymbolRef ref : *parameters) {
const auto &tpd{ref->get<TypeParamDetails>()};
- if (tpd.attr() == common::TypeParamAttr::Kind) {
+ if (tpd.attr() && *tpd.attr() == common::TypeParamAttr::Kind) {
if (const auto *pv{derivedTypeSpec.FindParameter(ref->name())}) {
if (pv->GetExplicit()) {
if (auto instantiatedValue{evaluate::ToInt64(*pv->GetExplicit())}) {
@@ -497,7 +498,7 @@ const Symbol *RuntimeTableBuilder::DescribeType(Scope &dtScope) {
for (SymbolRef ref : *parameters) {
if (const auto *inst{dtScope.FindComponent(ref->name())}) {
const auto &tpd{inst->get<TypeParamDetails>()};
- if (tpd.attr() == common::TypeParamAttr::Kind) {
+ if (tpd.attr() && *tpd.attr() == common::TypeParamAttr::Kind) {
auto value{evaluate::ToInt64(tpd.init()).value_or(0)};
if (derivedTypeSpec) {
if (const auto *pv{derivedTypeSpec->FindParameter(inst->name())}) {
@@ -799,7 +800,7 @@ evaluate::StructureConstructor RuntimeTableBuilder::DescribeComponent(
specParams{GetTypeParameters(spec.typeSymbol())}) {
for (SymbolRef ref : *specParams) {
const auto &tpd{ref->get<TypeParamDetails>()};
- if (tpd.attr() == common::TypeParamAttr::Len) {
+ if (tpd.attr() && *tpd.attr() == common::TypeParamAttr::Len) {
if (const ParamValue *
paramValue{spec.FindParameter(ref->name())}) {
lenParams.emplace_back(GetValue(*paramValue, parameters));
diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp
index c09734e..3cb24f6 100644
--- a/flang/lib/Semantics/semantics.cpp
+++ b/flang/lib/Semantics/semantics.cpp
@@ -160,6 +160,41 @@ private:
SemanticsContext &context_;
};
+static void WarnUndefinedFunctionResult(
+ SemanticsContext &context, const Scope &scope) {
+ auto WasDefined{[&context](const Symbol &symbol) {
+ return context.IsSymbolDefined(symbol) ||
+ IsInitialized(symbol, /*ignoreDataStatements=*/true,
+ /*ignoreAllocatable=*/true, /*ignorePointer=*/true);
+ }};
+ if (const Symbol * symbol{scope.symbol()}) {
+ if (const auto *subp{symbol->detailsIf<SubprogramDetails>()}) {
+ if (subp->isFunction() && !subp->isInterface() && !subp->stmtFunction()) {
+ bool wasDefined{WasDefined(subp->result())};
+ if (!wasDefined) {
+ // Definitions of ENTRY result variables also count.
+ for (const auto &pair : scope) {
+ const Symbol &local{*pair.second};
+ if (IsFunctionResult(local) && WasDefined(local)) {
+ wasDefined = true;
+ break;
+ }
+ }
+ if (!wasDefined) {
+ context.Say(
+ symbol->name(), "Function result is never defined"_warn_en_US);
+ }
+ }
+ }
+ }
+ }
+ if (!scope.IsModuleFile()) {
+ for (const Scope &child : scope.children()) {
+ WarnUndefinedFunctionResult(context, child);
+ }
+ }
+}
+
using StatementSemanticsPass1 = ExprChecker;
using StatementSemanticsPass2 = SemanticsVisitor<AllocateChecker,
ArithmeticIfStmtChecker, AssignmentChecker, CaseChecker, CoarrayChecker,
@@ -187,6 +222,9 @@ static bool PerformStatementSemantics(
SemanticsVisitor<CUDAChecker>{context}.Walk(program);
}
if (!context.AnyFatalError()) {
+ if (context.ShouldWarn(common::UsageWarning::UndefinedFunctionResult)) {
+ WarnUndefinedFunctionResult(context, context.globalScope());
+ }
pass2.CompileDataInitializationsIntoInitializers();
}
return !context.AnyFatalError();
@@ -712,4 +750,12 @@ CommonBlockList SemanticsContext::GetCommonBlocks() const {
return {};
}
+void SemanticsContext::NoteDefinedSymbol(const Symbol &symbol) {
+ isDefined_.insert(symbol);
+}
+
+bool SemanticsContext::IsSymbolDefined(const Symbol &symbol) const {
+ return isDefined_.find(symbol) != isDefined_.end();
+}
+
} // namespace Fortran::semantics
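
The new WarnUndefinedFunctionResult pass above relies on SemanticsContext recording which symbols were assigned to (NoteDefinedSymbol/IsSymbolDefined) and then walking the scope tree once statement semantics has run. A rough standalone model of that bookkeeping, with assumed, simplified Scope and Context types rather than flang's own:

    // Illustrative sketch only; not flang's actual scope or context classes.
    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    struct Scope {
      std::string functionResult;          // empty if this scope is not a function
      std::vector<std::string> entryResults; // results of ENTRY points, if any
      std::vector<Scope> children;
    };

    struct Context {
      std::set<std::string> defined;
      void NoteDefined(const std::string &s) { defined.insert(s); }
      bool IsDefined(const std::string &s) const { return defined.count(s) != 0; }
    };

    void WarnUndefinedFunctionResult(const Context &ctx, const Scope &scope) {
      if (!scope.functionResult.empty() && !ctx.IsDefined(scope.functionResult)) {
        // Definitions of ENTRY result variables also count.
        bool anyEntryDefined{false};
        for (const auto &r : scope.entryResults) {
          if (ctx.IsDefined(r)) {
            anyEntryDefined = true;
            break;
          }
        }
        if (!anyEntryDefined) {
          std::cout << "warning: function result '" << scope.functionResult
                    << "' is never defined\n";
        }
      }
      for (const Scope &child : scope.children) {
        WarnUndefinedFunctionResult(ctx, child);
      }
    }

    int main() {
      Context ctx;
      ctx.NoteDefined("good_result");
      Scope program{"", {}, {Scope{"good_result", {}, {}}, Scope{"bad_result", {}, {}}}};
      WarnUndefinedFunctionResult(ctx, program); // warns only for "bad_result"
    }
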
diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp
index 023ab7b..31e91ee 100644
--- a/flang/lib/Semantics/symbol.cpp
+++ b/flang/lib/Semantics/symbol.cpp
@@ -588,7 +588,11 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Details &details) {
},
[&](const TypeParamDetails &x) {
DumpOptional(os, "type", x.type());
- os << ' ' << common::EnumToString(x.attr());
+ if (auto attr{x.attr()}) {
+ os << ' ' << common::EnumToString(*attr);
+ } else {
+ os << " (no attr)";
+ }
DumpExpr(os, "init", x.init());
},
[&](const MiscDetails &x) {
@@ -739,9 +743,16 @@ const Symbol *DerivedTypeDetails::GetFinalForRank(int rank) const {
return nullptr;
}
-void TypeParamDetails::set_type(const DeclTypeSpec &type) {
+TypeParamDetails &TypeParamDetails::set_attr(common::TypeParamAttr attr) {
+ CHECK(!attr_);
+ attr_ = attr;
+ return *this;
+}
+
+TypeParamDetails &TypeParamDetails::set_type(const DeclTypeSpec &type) {
CHECK(!type_);
type_ = &type;
+ return *this;
}
bool GenericKind::IsIntrinsicOperator() const {
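
set_attr and set_type above become set-once, chainable setters over optional members. A tiny illustration of that pattern with stand-in names (not the real symbol-details classes), using assert in place of flang's CHECK:

    #include <cassert>
    #include <optional>

    enum class TypeParamAttr { Kind, Len };

    class Details {
    public:
      Details &set_attr(TypeParamAttr attr) {
        assert(!attr_ && "attribute may be set only once");
        attr_ = attr;
        return *this; // returning *this allows call chaining at construction sites
      }
      const std::optional<TypeParamAttr> &attr() const { return attr_; }

    private:
      std::optional<TypeParamAttr> attr_;
    };

    int main() {
      Details d;
      d.set_attr(TypeParamAttr::Kind);
      assert(d.attr() && *d.attr() == TypeParamAttr::Kind);
    }
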
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 9938191..fdaf052 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -685,7 +685,7 @@ bool IsInitialized(const Symbol &symbol, bool ignoreDataStatements,
return true;
} else if (IsPointer(symbol)) {
return !ignorePointer;
- } else if (IsNamedConstant(symbol) || IsFunctionResult(symbol)) {
+ } else if (IsNamedConstant(symbol)) {
return false;
} else if (const auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
if (!object->isDummy() && object->type()) {
@@ -1135,12 +1135,12 @@ std::optional<parser::MessageFormattedText> CheckAccessibleSymbol(
return std::nullopt;
}
-std::list<SourceName> OrderParameterNames(const Symbol &typeSymbol) {
- std::list<SourceName> result;
+SymbolVector OrderParameterNames(const Symbol &typeSymbol) {
+ SymbolVector result;
if (const DerivedTypeSpec * spec{typeSymbol.GetParentTypeSpec()}) {
result = OrderParameterNames(spec->typeSymbol());
}
- const auto &paramNames{typeSymbol.get<DerivedTypeDetails>().paramNames()};
+ const auto &paramNames{typeSymbol.get<DerivedTypeDetails>().paramNameOrder()};
result.insert(result.end(), paramNames.begin(), paramNames.end());
return result;
}
@@ -1150,7 +1150,7 @@ SymbolVector OrderParameterDeclarations(const Symbol &typeSymbol) {
if (const DerivedTypeSpec * spec{typeSymbol.GetParentTypeSpec()}) {
result = OrderParameterDeclarations(spec->typeSymbol());
}
- const auto &paramDecls{typeSymbol.get<DerivedTypeDetails>().paramDecls()};
+ const auto &paramDecls{typeSymbol.get<DerivedTypeDetails>().paramDeclOrder()};
result.insert(result.end(), paramDecls.begin(), paramDecls.end());
return result;
}
diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp
index ed24743..cfaee0b 100644
--- a/flang/lib/Semantics/type.cpp
+++ b/flang/lib/Semantics/type.cpp
@@ -63,7 +63,6 @@ void DerivedTypeSpec::CookParameters(evaluate::FoldingContext &foldingContext) {
// Parameters of the most deeply nested "base class" come first when the
// derived type is an extension.
auto parameterNames{OrderParameterNames(typeSymbol_)};
- auto parameterDecls{OrderParameterDeclarations(typeSymbol_)};
auto nextNameIter{parameterNames.begin()};
RawParameters raw{std::move(rawParameters_)};
for (auto &[maybeKeyword, value] : raw) {
@@ -71,25 +70,25 @@ void DerivedTypeSpec::CookParameters(evaluate::FoldingContext &foldingContext) {
common::TypeParamAttr attr{common::TypeParamAttr::Kind};
if (maybeKeyword) {
name = maybeKeyword->v.source;
- auto it{std::find_if(parameterDecls.begin(), parameterDecls.end(),
+ auto it{std::find_if(parameterNames.begin(), parameterNames.end(),
[&](const Symbol &symbol) { return symbol.name() == name; })};
- if (it == parameterDecls.end()) {
+ if (it == parameterNames.end()) {
messages.Say(name,
"'%s' is not the name of a parameter for derived type '%s'"_err_en_US,
name, typeSymbol_.name());
} else {
// Resolve the keyword's symbol
maybeKeyword->v.symbol = const_cast<Symbol *>(&it->get());
- attr = it->get().get<TypeParamDetails>().attr();
+ if (const auto *tpd{it->get().detailsIf<TypeParamDetails>()}) {
+ attr = tpd->attr().value_or(attr);
+ }
}
} else if (nextNameIter != parameterNames.end()) {
- name = *nextNameIter++;
- auto it{std::find_if(parameterDecls.begin(), parameterDecls.end(),
- [&](const Symbol &symbol) { return symbol.name() == name; })};
- if (it == parameterDecls.end()) {
- break;
+ name = nextNameIter->get().name();
+ if (const auto *tpd{nextNameIter->get().detailsIf<TypeParamDetails>()}) {
+ attr = tpd->attr().value_or(attr);
}
- attr = it->get().get<TypeParamDetails>().attr();
+ ++nextNameIter;
} else {
messages.Say(name_,
"Too many type parameters given for derived type '%s'"_err_en_US,
@@ -160,7 +159,7 @@ void DerivedTypeSpec::EvaluateParameters(SemanticsContext &context) {
// Default type parameter value expressions are folded within
// the scope of the derived type being instantiated.
const TypeParamDetails &details{symbol.get<TypeParamDetails>()};
- if (details.init()) {
+ if (details.init() && details.attr()) {
evaluate::DynamicType dyType{TypeCategory::Integer, parameterKind};
if (auto converted{
evaluate::ConvertToType(dyType, SomeExpr{*details.init()})}) {
@@ -169,8 +168,8 @@ void DerivedTypeSpec::EvaluateParameters(SemanticsContext &context) {
evaluate::Fold(foldingContext, std::move(*converted))};
ok = ok || evaluate::IsActuallyConstant(folded);
AddParamValue(name,
- ParamValue{
- std::move(std::get<SomeIntExpr>(folded.u)), details.attr()});
+ ParamValue{std::move(std::get<SomeIntExpr>(folded.u)),
+ details.attr().value()});
} else {
if (!context.HasError(symbol)) {
evaluate::SayWithDeclaration(messages, symbol,
@@ -233,7 +232,7 @@ ParamValue *DerivedTypeSpec::FindParameter(SourceName target) {
static bool MatchKindParams(const Symbol &typeSymbol,
const DerivedTypeSpec &thisSpec, const DerivedTypeSpec &thatSpec) {
- for (auto ref : typeSymbol.get<DerivedTypeDetails>().paramDecls()) {
+ for (auto ref : typeSymbol.get<DerivedTypeDetails>().paramNameOrder()) {
if (ref->get<TypeParamDetails>().attr() == common::TypeParamAttr::Kind) {
const auto *thisValue{thisSpec.FindParameter(ref->name())};
const auto *thatValue{thatSpec.FindParameter(ref->name())};
@@ -369,12 +368,15 @@ void DerivedTypeSpec::Instantiate(Scope &containingScope) {
// uninitialized type parameter to forestall use of any default.
if (ParamValue * paramValue{FindParameter(name)}) {
const TypeParamDetails &details{symbol.get<TypeParamDetails>()};
- paramValue->set_attr(details.attr());
+ TypeParamDetails instanceDetails{};
+ if (details.attr()) {
+ paramValue->set_attr(*details.attr());
+ instanceDetails.set_attr(*details.attr());
+ }
desc += sep;
desc += name.ToString();
desc += '=';
sep = ',';
- TypeParamDetails instanceDetails{details.attr()};
if (MaybeIntExpr expr{paramValue->GetExplicit()}) {
desc += expr->AsFortran();
instanceDetails.set_init(
diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp
index dfd3b81..2032ce7 100644
--- a/flang/runtime/exceptions.cpp
+++ b/flang/runtime/exceptions.cpp
@@ -6,11 +6,10 @@
//
//===----------------------------------------------------------------------===//
-// Map Fortran ieee_arithmetic module exceptions to fenv.h exceptions.
+// Runtime exception support.
#include "flang/Runtime/exceptions.h"
#include "terminator.h"
-#include "flang/Runtime/magic-numbers.h"
#include <cfenv>
#ifndef __FE_DENORM
@@ -21,58 +20,32 @@ namespace Fortran::runtime {
extern "C" {
-std::int32_t RTNAME(MapException)(int32_t except) {
+// Map a set of Fortran ieee_arithmetic module exceptions to a libm fenv.h
+// excepts value.
+uint32_t RTNAME(MapException)(uint32_t excepts) {
Terminator terminator{__FILE__, __LINE__};
- static constexpr int32_t mask{_FORTRAN_RUNTIME_IEEE_INVALID |
- _FORTRAN_RUNTIME_IEEE_DENORM | _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO |
- _FORTRAN_RUNTIME_IEEE_OVERFLOW | _FORTRAN_RUNTIME_IEEE_UNDERFLOW |
- _FORTRAN_RUNTIME_IEEE_INEXACT};
- if (except == 0 || except != (except & mask)) {
- terminator.Crash("Invalid exception value: %d", except);
+ static constexpr uint32_t v{FE_INVALID};
+ static constexpr uint32_t s{__FE_DENORM}; // subnormal
+ static constexpr uint32_t z{FE_DIVBYZERO};
+ static constexpr uint32_t o{FE_OVERFLOW};
+ static constexpr uint32_t u{FE_UNDERFLOW};
+ static constexpr uint32_t x{FE_INEXACT};
+
+#define vm(p) p, p | v
+#define sm(p) vm(p), vm(p | s)
+#define zm(p) sm(p), sm(p | z)
+#define om(p) zm(p), zm(p | o)
+#define um(p) om(p), om(p | u)
+#define xm um(0), um(x)
+
+ static constexpr uint32_t map[]{xm};
+ static constexpr uint32_t mapSize{sizeof(map) / sizeof(uint32_t)};
+ static_assert(mapSize == 64);
+ if (excepts == 0 || excepts >= mapSize) {
+ terminator.Crash("Invalid excepts value: %d", excepts);
}
-
- // Fortran and fenv.h values are identical; return the value.
- if constexpr (_FORTRAN_RUNTIME_IEEE_INVALID == FE_INVALID &&
- _FORTRAN_RUNTIME_IEEE_DENORM == __FE_DENORM &&
- _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO == FE_DIVBYZERO &&
- _FORTRAN_RUNTIME_IEEE_OVERFLOW == FE_OVERFLOW &&
- _FORTRAN_RUNTIME_IEEE_UNDERFLOW == FE_UNDERFLOW &&
- _FORTRAN_RUNTIME_IEEE_INEXACT == FE_INEXACT) {
- return except;
- }
-
- // fenv.h calls that take exception arguments are able to process multiple
- // exceptions in one call, such as FE_OVERFLOW | FE_DIVBYZERO | FE_INVALID.
- // And intrinsic module procedures that manage exceptions are elemental
- // procedures that may specify multiple exceptions, such as ieee_all.
- // However, general elemental call processing places single scalar arguments
- // in a loop. As a consequence, argument 'except' here will be a power of
- // two, corresponding to a single exception. If code generation were
- // modified to bypass normal elemental call processing for calls with
- // ieee_usual, ieee_all, or user-specified array arguments, this switch
- // could be extended to support that.
-
- // Fortran and fenv.h values differ.
- switch (except) {
- case _FORTRAN_RUNTIME_IEEE_INVALID:
- return FE_INVALID;
- case _FORTRAN_RUNTIME_IEEE_DENORM:
- if (__FE_DENORM) {
- return __FE_DENORM;
- }
- break;
- case _FORTRAN_RUNTIME_IEEE_DIVIDE_BY_ZERO:
- return FE_DIVBYZERO;
- case _FORTRAN_RUNTIME_IEEE_OVERFLOW:
- return FE_OVERFLOW;
- case _FORTRAN_RUNTIME_IEEE_UNDERFLOW:
- return FE_UNDERFLOW;
- case _FORTRAN_RUNTIME_IEEE_INEXACT:
- return FE_INEXACT;
- }
-
- terminator.Crash("Invalid exception set: %d", except);
+ return map[excepts];
}
// Verify that the size of ieee_modes_type and ieee_status_type objects from
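
The rewritten MapException above builds a 64-entry table with nested macros so that any combination of the six Fortran ieee_arithmetic flags (excepts values 1..63, with bit 0 = invalid, bit 1 = denormal, bit 2 = divide-by-zero, bit 3 = overflow, bit 4 = underflow, bit 5 = inexact, as implied by the vm/sm/zm/om/um/xm nesting) maps to the bitwise OR of the matching fenv.h flags. An illustrative loop-built equivalent, not the runtime's actual code; the nonstandard __FE_DENORM bit is stubbed out with 0 here:

    #include <cassert>
    #include <cfenv>
    #include <cstdint>

    int main() {
      // Flag selected by each bit of the Fortran excepts value.
      const std::uint32_t flags[]{FE_INVALID, 0 /* denormal, if supported */,
                                  FE_DIVBYZERO, FE_OVERFLOW, FE_UNDERFLOW,
                                  FE_INEXACT};
      std::uint32_t map[64]{};
      for (std::uint32_t excepts{0}; excepts < 64; ++excepts) {
        for (int bit{0}; bit < 6; ++bit) {
          if (excepts & (1u << bit)) {
            map[excepts] |= flags[bit];
          }
        }
      }
      assert(map[0b000101] == (FE_INVALID | FE_DIVBYZERO));
      assert(map[0b100000] == FE_INEXACT);
    }
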
diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90
index c7cc3bf..f8876c9 100644
--- a/flang/test/Driver/omp-driver-offload.f90
+++ b/flang/test/Driver/omp-driver-offload.f90
@@ -14,12 +14,12 @@
! Test regular -fopenmp with offload, and invocation filtering options
! RUN: %flang -S -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
! RUN: | FileCheck %s --check-prefix=OFFLOAD-HOST-AND-DEVICE
! RUN: %flang -S -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-host-device \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
! RUN: | FileCheck %s --check-prefix=OFFLOAD-HOST-AND-DEVICE
! OFFLOAD-HOST-AND-DEVICE: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
@@ -29,7 +29,7 @@
! RUN: %flang -S -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-host-only \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
! RUN: | FileCheck %s --check-prefix=OFFLOAD-HOST
! OFFLOAD-HOST: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
@@ -39,7 +39,7 @@
! RUN: %flang -S -### %s 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a --offload-arch=sm_70 --offload-device-only \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
! RUN: | FileCheck %s --check-prefix=OFFLOAD-DEVICE
! OFFLOAD-DEVICE: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
@@ -48,13 +48,13 @@
! OFFLOAD-DEVICE-NOT: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
! Test regular -fopenmp with offload for basic fopenmp-is-target-device flag addition and correct fopenmp
-! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s
+! RUN: %flang -### -fopenmp --offload-arch=gfx90a -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib %s 2>&1 | FileCheck --check-prefixes=CHECK-OPENMP-IS-TARGET-DEVICE %s
! CHECK-OPENMP-IS-TARGET-DEVICE: "{{[^"]*}}flang-new" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" {{.*}}.f90"
! Testing appropriate flags are generated and appropriately assigned by the driver when offloading
! RUN: %flang -S -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
! RUN: | FileCheck %s --check-prefix=OPENMP-OFFLOAD-ARGS
! OPENMP-OFFLOAD-ARGS: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu" {{.*}} "-fopenmp" {{.*}}.f90"
! OPENMP-OFFLOAD-ARGS-NEXT: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa"
@@ -70,19 +70,19 @@
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-threads-oversubscription \
+! RUN: -fopenmp-assume-threads-oversubscription -nogpulib \
! RUN: | FileCheck %s --check-prefixes=CHECK-THREADS-OVS
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
! RUN: -fopenmp-targets=nvptx64-nvidia-cuda \
-! RUN: -fopenmp-assume-threads-oversubscription \
+! RUN: -fopenmp-assume-threads-oversubscription \
! RUN: | FileCheck %s --check-prefixes=CHECK-THREADS-OVS
! CHECK-THREADS-OVS: "{{[^"]*}}flang-new" "-fc1" {{.*}} "-fopenmp" {{.*}} "-fopenmp-is-target-device" "-fopenmp-assume-threads-oversubscription" {{.*}}.f90"
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-teams-oversubscription \
+! RUN: -fopenmp-assume-teams-oversubscription -nogpulib\
! RUN: | FileCheck %s --check-prefixes=CHECK-TEAMS-OVS
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
@@ -94,7 +94,7 @@
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-no-nested-parallelism \
+! RUN: -fopenmp-assume-no-nested-parallelism -nogpulib\
! RUN: | FileCheck %s --check-prefixes=CHECK-NEST-PAR
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
@@ -106,7 +106,7 @@
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-assume-no-thread-state \
+! RUN: -fopenmp-assume-no-thread-state -nogpulib\
! RUN: | FileCheck %s --check-prefixes=CHECK-THREAD-STATE
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
@@ -118,7 +118,7 @@
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-target-debug \
+! RUN: -fopenmp-target-debug -nogpulib\
! RUN: | FileCheck %s --check-prefixes=CHECK-TARGET-DEBUG
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
@@ -130,7 +130,7 @@
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-target-debug \
+! RUN: -fopenmp-target-debug -nogpulib\
! RUN: | FileCheck %s --check-prefixes=CHECK-TARGET-DEBUG
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
@@ -144,7 +144,7 @@
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
! RUN: -fopenmp-target-debug -fopenmp-assume-threads-oversubscription \
! RUN: -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism \
-! RUN: -fopenmp-assume-no-thread-state \
+! RUN: -fopenmp-assume-no-thread-state -nogpulib\
! RUN: | FileCheck %s --check-prefixes=CHECK-RTL-ALL
! RUN: %flang -S -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
@@ -160,7 +160,7 @@
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=gfx90a \
! RUN: -fopenmp-targets=amdgcn-amd-amdhsa \
-! RUN: -fopenmp-version=45 \
+! RUN: -fopenmp-version=45 -nogpulib\
! RUN: | FileCheck %s --check-prefixes=CHECK-OPENMP-VERSION
! RUN: %flang -### %s -o %t 2>&1 \
! RUN: -fopenmp --offload-arch=sm_70 \
@@ -193,10 +193,17 @@
! Test -fopenmp-force-usm option with offload
! RUN: %flang -S -### %s -o %t 2>&1 \
! RUN: -fopenmp -fopenmp-force-usm --offload-arch=gfx90a \
-! RUN: --target=aarch64-unknown-linux-gnu \
+! RUN: --target=aarch64-unknown-linux-gnu -nogpulib\
! RUN: | FileCheck %s --check-prefix=FORCE-USM-OFFLOAD
! FORCE-USM-OFFLOAD: "{{[^"]*}}flang-new" "-fc1" "-triple" "aarch64-unknown-linux-gnu"
! FORCE-USM-OFFLOAD-SAME: "-fopenmp" "-fopenmp-force-usm"
! FORCE-USM-OFFLOAD-NEXT: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa"
! FORCE-USM-OFFLOAD-SAME: "-fopenmp" "-fopenmp-force-usm"
+
+! RUN: %flang -### -v --target=x86_64-unknown-linux-gnu -fopenmp \
+! RUN: --offload-arch=gfx900 \
+! RUN: --rocm-path=%S/Inputs/rocm %s 2>&1 \
+! RUN: | FileCheck --check-prefix=MLINK-BUILTIN-BITCODE %s
+! MLINK-BUILTIN-BITCODE: "{{[^"]*}}flang-new" "-fc1" "-triple" "amdgcn-amd-amdhsa"
+! MLINK-BUILTIN-BITCODE-SAME: "-mlink-builtin-bitcode" {{.*Inputs.*rocm.*amdgcn.*bitcode.*}}oclc_isa_version_900.bc
diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90
index 0f19e4e..1c77d4a 100644
--- a/flang/test/Driver/target-cpu-features.f90
+++ b/flang/test/Driver/target-cpu-features.f90
@@ -29,10 +29,10 @@
! RUN: %flang --target=riscv64-linux-gnu -c %s -### 2>&1 \
! RUN: | FileCheck %s -check-prefix=CHECK-RV64
-! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx908 -c %s -### 2>&1 \
+! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx908 -nogpulib -c %s -### 2>&1 \
! RUN: | FileCheck %s -check-prefix=CHECK-AMDGPU
-! RUN: %flang --target=r600-unknown-unknown -mcpu=cayman -c %s -### 2>&1 \
+! RUN: %flang --target=r600-unknown-unknown -mcpu=cayman -nogpulib -c %s -### 2>&1 \
! RUN: | FileCheck %s -check-prefix=CHECK-AMDGPU-R600
! CHECK-A57: "-fc1" "-triple" "aarch64-unknown-linux-gnu"
diff --git a/flang/test/Driver/target-gpu-features.f90 b/flang/test/Driver/target-gpu-features.f90
index 9cc9ce4..b7835743 100644
--- a/flang/test/Driver/target-gpu-features.f90
+++ b/flang/test/Driver/target-gpu-features.f90
@@ -3,7 +3,7 @@
! Test that -mcpu is used and that the -target-cpu and -target-features
! are also added to the fc1 command.
-! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx902 -c %s -### 2>&1 \
+! RUN: %flang --target=amdgcn-amd-amdhsa -mcpu=gfx902 -nogpulib -c %s -### 2>&1 \
! RUN: | FileCheck %s -check-prefix=CHECK-AMDGCN
! CHECK-AMDGCN: "-fc1" "-triple" "amdgcn-amd-amdhsa"
diff --git a/flang/test/Evaluate/fold-assumed-type-rank.f90 b/flang/test/Evaluate/fold-assumed-type-rank.f90
new file mode 100644
index 0000000..ce296c8
--- /dev/null
+++ b/flang/test/Evaluate/fold-assumed-type-rank.f90
@@ -0,0 +1,6 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+subroutine sub3(ar_at)
+ type(*) :: ar_at(..)
+!CHECK: PRINT *, int(int(rank(ar_at),kind=8),kind=4)
+ print *, rank(ar_at)
+end
diff --git a/flang/test/Evaluate/fold-nearest.f90 b/flang/test/Evaluate/fold-nearest.f90
index a7366e6..41853a6 100644
--- a/flang/test/Evaluate/fold-nearest.f90
+++ b/flang/test/Evaluate/fold-nearest.f90
@@ -39,6 +39,7 @@ end module
module m2
use ieee_arithmetic, only: ieee_next_after
real, parameter :: minSubnormal = 1.e-45
+ real, parameter :: h = huge(0.0)
logical, parameter :: test_0 = ieee_next_after(0., 0.) == 0.
logical, parameter :: test_1 = ieee_next_after(0., 1.) == minSubnormal
logical, parameter :: test_2 = ieee_next_after(minSubnormal, -1.) == 0
@@ -47,9 +48,9 @@ module m2
!WARN: warning: division by zero
real, parameter :: inf = 1. / 0.
logical, parameter :: test_5 = ieee_next_after(inf, inf) == inf
- logical, parameter :: test_6 = ieee_next_after(inf, -inf) == inf
- logical, parameter :: test_7 = ieee_next_after(-inf, inf) == -inf
- logical, parameter :: test_8 = ieee_next_after(-inf, -1.) == -inf
+ logical, parameter :: test_6 = ieee_next_after(inf, -inf) == h
+ logical, parameter :: test_7 = ieee_next_after(-inf, inf) == -h
+ logical, parameter :: test_8 = ieee_next_after(-inf, -1.) == -h
logical, parameter :: test_9 = ieee_next_after(1.9999999, 3.) == 2.
logical, parameter :: test_10 = ieee_next_after(2., 1.) == 1.9999999
#if __x86_64__
@@ -69,6 +70,7 @@ end module
module m3
use ieee_arithmetic, only: ieee_next_up, ieee_next_down
real(kind(0.d0)), parameter :: minSubnormal = 5.d-324
+ real(kind(0.d0)), parameter :: h = huge(0.d0)
logical, parameter :: test_1 = ieee_next_up(0.d0) == minSubnormal
logical, parameter :: test_2 = ieee_next_down(0.d0) == -minSubnormal
logical, parameter :: test_3 = ieee_next_up(1.d0) == 1.0000000000000002d0
@@ -81,10 +83,8 @@ module m3
logical, parameter :: test_6 = ieee_next_down(-huge(0.d0)) == -inf
!WARN: warning: IEEE_NEXT_UP intrinsic folding: bad argument
logical, parameter :: test_7 = ieee_next_up(inf) == inf
- !WARN: warning: IEEE_NEXT_DOWN intrinsic folding: bad argument
- logical, parameter :: test_8 = ieee_next_down(inf) == inf
- !WARN: warning: IEEE_NEXT_UP intrinsic folding: bad argument
- logical, parameter :: test_9 = ieee_next_up(-inf) == -inf
+ logical, parameter :: test_8 = ieee_next_down(inf) == h
+ logical, parameter :: test_9 = ieee_next_up(-inf) == -h
!WARN: warning: IEEE_NEXT_DOWN intrinsic folding: bad argument
logical, parameter :: test_10 = ieee_next_down(-inf) == -inf
logical, parameter :: test_11 = ieee_next_up(1.9999999999999997d0) == 2.d0
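
The updated fold-nearest expectations above have ieee_next_after/ieee_next_up/ieee_next_down step off an infinity to +/-huge rather than saturating at the infinity, which matches IEEE-754 nextafter behavior. Assuming a C library with IEEE-754 semantics, the same effect can be observed with std::nextafter:

    #include <cassert>
    #include <cfloat>
    #include <cmath>
    #include <limits>

    int main() {
      const double inf{std::numeric_limits<double>::infinity()};
      // Stepping from +/-inf toward zero lands on +/-huge(0.d0), matching the
      // updated folding tests above.
      assert(std::nextafter(inf, 0.0) == DBL_MAX);
      assert(std::nextafter(-inf, 0.0) == -DBL_MAX);
      // Stepping from +inf toward +inf stays at +inf (test_7 above).
      assert(std::nextafter(inf, inf) == inf);
    }
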
diff --git a/flang/test/Evaluate/folding08.f90 b/flang/test/Evaluate/folding08.f90
index 1b2e560..5360347 100644
--- a/flang/test/Evaluate/folding08.f90
+++ b/flang/test/Evaluate/folding08.f90
@@ -11,6 +11,11 @@ module m
end type
type(t) :: ta(0:2)
character(len=2) :: ca(-1:1)
+ interface
+ function foo()
+ real :: foo(2:3,4:6)
+ end function
+ end interface
integer, parameter :: lbtadim = lbound(ta,1)
logical, parameter :: test_lbtadim = lbtadim == 0
integer, parameter :: ubtadim = ubound(ta,1)
@@ -47,9 +52,6 @@ module m
logical, parameter :: test_lb_empty_dim = lbound(empty, 1) == 1
logical, parameter :: test_ub_empty_dim = ubound(empty, 1) == 0
contains
- function foo()
- real :: foo(2:3,4:6)
- end function
subroutine test(n1,a1,a2)
integer, intent(in) :: n1
real, intent(in) :: a1(1:n1), a2(0:*)
diff --git a/flang/test/Evaluate/rewrite08.f90 b/flang/test/Evaluate/rewrite08.f90
new file mode 100644
index 0000000..c596055
--- /dev/null
+++ b/flang/test/Evaluate/rewrite08.f90
@@ -0,0 +1,21 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+subroutine s(oi,ol)
+ integer(1), optional, intent(in) :: oi
+ logical(1), optional, intent(in) :: ol
+ integer(1), allocatable :: ai
+ logical(1), allocatable :: al
+ integer(1), pointer :: pi
+ logical(1), pointer :: pl
+!CHECK: PRINT *, ishftc(-1_4,1_4,oi)
+!CHECK: PRINT *, ishftc(-1_4,1_4,ai)
+!CHECK: PRINT *, ishftc(-1_4,1_4,pi)
+!CHECK: PRINT *, findloc([INTEGER(4)::1_4,2_4,1_4],1_4,back=ol)
+!CHECK: PRINT *, findloc([INTEGER(4)::1_4,2_4,1_4],1_4,back=al)
+!CHECK: PRINT *, findloc([INTEGER(4)::1_4,2_4,1_4],1_4,back=pl)
+ print *, ishftc(-1,1,oi)
+ print *, ishftc(-1,1,ai)
+ print *, ishftc(-1,1,pi)
+ print *, findloc([1,2,1],1,back=ol)
+ print *, findloc([1,2,1],1,back=al)
+ print *, findloc([1,2,1],1,back=pl)
+end
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index eca762d..4b9afd5 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -472,7 +472,7 @@ func.func @_QPomp_target() {
// CHECK: %[[UPPER:.*]] = llvm.mlir.constant(511 : index) : i64
// CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[LOWER]] : i64) upper_bound(%[[UPPER]] : i64) extent(%[[EXTENT]] : i64) stride(%[[STRIDE]] : i64) start_idx(%[[STRIDE]] : i64)
// CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !llvm.ptr, !llvm.array<512 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !llvm.ptr {name = "a"}
-// CHECK: omp.target thread_limit(%[[VAL_2]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !llvm.ptr) {
+// CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !llvm.ptr) thread_limit(%[[VAL_2]] : i32) {
// CHECK: ^bb0(%[[ARG_0]]: !llvm.ptr):
// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(10 : i32) : i32
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i64) : i64
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index d657f81..7eb74a4 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -133,7 +133,7 @@ subroutine sub4()
integer, parameter :: n = 10
real, device :: adev(n)
real :: ahost(n)
- real :: b
+ real, managed :: b
integer :: i
adev = ahost
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index 99cb6eb..ba5d390 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -7,8 +7,8 @@ subroutine sub1()
integer :: i, j
integer, parameter :: n = 100
integer(8) :: istream
- real :: a(n), b(n)
- real :: c(n,n), d(n,n)
+ real, device :: a(n), b(n)
+ real, device :: c(n,n), d(n,n)
! CHECK-LABEL: func.func @_QPsub1()
! CHECK: %[[IV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
diff --git a/flang/test/Lower/Intrinsics/ieee_next.f90 b/flang/test/Lower/Intrinsics/ieee_next.f90
new file mode 100644
index 0000000..fa9692b
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/ieee_next.f90
@@ -0,0 +1,284 @@
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
+
+! CHECK-LABEL: c.func @_QQmain
+program p
+ use ieee_arithmetic, only: ieee_value, ieee_negative_inf, ieee_positive_inf
+ use ieee_arithmetic, only: ieee_next_after, ieee_next_down, ieee_next_up
+ implicit none
+ ! CHECK-DAG: %[[V_4:[0-9]+]] = fir.alloca f80 {bindc_name = "r10", uniq_name = "_QFEr10"}
+ ! CHECK-DAG: %[[V_5:[0-9]+]] = fir.declare %[[V_4]] {uniq_name = "_QFEr10"} : (!fir.ref<f80>) -> !fir.ref<f80>
+ ! CHECK-DAG: %[[V_6:[0-9]+]] = fir.alloca f128 {bindc_name = "r16", uniq_name = "_QFEr16"}
+ ! CHECK-DAG: %[[V_7:[0-9]+]] = fir.declare %[[V_6]] {uniq_name = "_QFEr16"} : (!fir.ref<f128>) -> !fir.ref<f128>
+ ! CHECK-DAG: %[[V_8:[0-9]+]] = fir.alloca f16 {bindc_name = "r2", uniq_name = "_QFEr2"}
+ ! CHECK-DAG: %[[V_9:[0-9]+]] = fir.declare %[[V_8]] {uniq_name = "_QFEr2"} : (!fir.ref<f16>) -> !fir.ref<f16>
+ ! CHECK-DAG: %[[V_10:[0-9]+]] = fir.alloca bf16 {bindc_name = "r3", uniq_name = "_QFEr3"}
+ ! CHECK-DAG: %[[V_11:[0-9]+]] = fir.declare %[[V_10]] {uniq_name = "_QFEr3"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
+ ! CHECK-DAG: %[[V_12:[0-9]+]] = fir.alloca f32 {bindc_name = "r4", uniq_name = "_QFEr4"}
+ ! CHECK-DAG: %[[V_13:[0-9]+]] = fir.declare %[[V_12]] {uniq_name = "_QFEr4"} : (!fir.ref<f32>) -> !fir.ref<f32>
+ ! CHECK-DAG: %[[V_14:[0-9]+]] = fir.alloca f64 {bindc_name = "r8", uniq_name = "_QFEr8"}
+ ! CHECK-DAG: %[[V_15:[0-9]+]] = fir.declare %[[V_14]] {uniq_name = "_QFEr8"} : (!fir.ref<f64>) -> !fir.ref<f64>
+ ! CHECK-DAG: %[[V_16:[0-9]+]] = fir.address_of(@_QFEx10) : !fir.ref<f80>
+ ! CHECK-DAG: %[[V_17:[0-9]+]] = fir.declare %[[V_16]] {uniq_name = "_QFEx10"} : (!fir.ref<f80>) -> !fir.ref<f80>
+ ! CHECK-DAG: %[[V_18:[0-9]+]] = fir.alloca f128 {bindc_name = "x16", uniq_name = "_QFEx16"}
+ ! CHECK-DAG: %[[V_19:[0-9]+]] = fir.declare %[[V_18]] {uniq_name = "_QFEx16"} : (!fir.ref<f128>) -> !fir.ref<f128>
+ ! CHECK-DAG: %[[V_20:[0-9]+]] = fir.alloca f16 {bindc_name = "x2", uniq_name = "_QFEx2"}
+ ! CHECK-DAG: %[[V_21:[0-9]+]] = fir.declare %[[V_20]] {uniq_name = "_QFEx2"} : (!fir.ref<f16>) -> !fir.ref<f16>
+ ! CHECK-DAG: %[[V_22:[0-9]+]] = fir.address_of(@_QFEx3) : !fir.ref<bf16>
+ ! CHECK-DAG: %[[V_23:[0-9]+]] = fir.declare %[[V_22]] {uniq_name = "_QFEx3"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
+ ! CHECK-DAG: %[[V_24:[0-9]+]] = fir.address_of(@_QFEx4) : !fir.ref<f32>
+ ! CHECK-DAG: %[[V_25:[0-9]+]] = fir.declare %[[V_24]] {uniq_name = "_QFEx4"} : (!fir.ref<f32>) -> !fir.ref<f32>
+ ! CHECK-DAG: %[[V_26:[0-9]+]] = fir.address_of(@_QFEx8) : !fir.ref<f64>
+ ! CHECK-DAG: %[[V_27:[0-9]+]] = fir.declare %[[V_26]] {uniq_name = "_QFEx8"} : (!fir.ref<f64>) -> !fir.ref<f64>
+ real(2) :: r2, x2
+ real(3) :: r3, x3 = -huge(x3)
+ real(4) :: r4, x4 = -0.
+ real(8) :: r8, x8 = 0.
+ real(10) :: r10, x10 = huge(x10)
+ real(16) :: r16, x16
+
+ x2 = ieee_value(x2, ieee_negative_inf)
+ x16 = ieee_value(x2, ieee_positive_inf)
+
+ ! CHECK: %[[V_45:[0-9]+]] = fir.load %[[V_21]] : !fir.ref<f16>
+ ! CHECK: %[[V_46:[0-9]+]] = fir.load %[[V_17]] : !fir.ref<f80>
+ ! CHECK-DAG: %[[V_47:[0-9]+]] = fir.coordinate_of %{{.*}}, %c2{{.*}} : (!fir.ref<!fir.array<12xi16>>, i8) -> !fir.ref<i16>
+ ! CHECK-DAG: %[[V_48:[0-9]+]] = fir.load %[[V_47]] : !fir.ref<i16>
+ ! CHECK-DAG: %[[V_49:[0-9]+]] = arith.bitcast %[[V_48]] : i16 to f16
+ ! CHECK-DAG: %[[V_50:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_46]]) <{bit = 3 : i32}> : (f80) -> i1
+ ! CHECK: %[[V_51:[0-9]+]] = arith.select %[[V_50]], %[[V_49]], %[[V_45]] : f16
+ ! CHECK: %[[V_52:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_51]]) <{bit = 3 : i32}> : (f16) -> i1
+ ! CHECK: %[[V_53:[0-9]+]] = fir.convert %[[V_51]] : (f16) -> f80
+ ! CHECK: %[[V_54:[0-9]+]] = arith.cmpf oeq, %[[V_53]], %[[V_46]] fastmath<contract> : f80
+ ! CHECK: %[[V_55:[0-9]+]] = arith.ori %[[V_52]], %[[V_54]] : i1
+ ! CHECK: %[[V_56:[0-9]+]] = arith.cmpf olt, %[[V_53]], %[[V_46]] fastmath<contract> : f80
+ ! CHECK: %[[V_57:[0-9]+]] = arith.bitcast %[[V_45]] : f16 to i16
+ ! CHECK: %[[V_58:[0-9]+]] = arith.shrui %[[V_57]], %c15{{.*}} : i16
+ ! CHECK: %[[V_59:[0-9]+]] = fir.convert %[[V_58]] : (i16) -> i1
+ ! CHECK: %[[V_60:[0-9]+]] = arith.cmpi ne, %[[V_56]], %[[V_59]] : i1
+ ! CHECK: %[[V_61:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_51]]) <{bit = 516 : i32}> : (f16) -> i1
+ ! CHECK: %[[V_62:[0-9]+]] = arith.andi %[[V_61]], %[[V_60]] : i1
+ ! CHECK: %[[V_63:[0-9]+]] = arith.ori %[[V_55]], %[[V_62]] : i1
+ ! CHECK: %[[V_64:[0-9]+]] = fir.if %[[V_63]] -> (f16) {
+ ! CHECK: fir.result %[[V_51]] : f16
+ ! CHECK: } else {
+ ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_51]], %cst{{[_0-9]*}} fastmath<contract> : f16
+ ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f16) {
+ ! CHECK: %[[V_204:[0-9]+]] = arith.select %[[V_56]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f16
+ ! CHECK: %[[V_205:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_204]] : f16
+ ! CHECK: } else {
+ ! CHECK: %[[V_204:[0-9]+]] = arith.bitcast %[[V_51]] : f16 to i16
+ ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i16
+ ! CHECK-DAG: %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i16
+ ! CHECK: %[[V_207:[0-9]+]] = arith.select %[[V_60]], %[[V_206]], %[[V_205]] : i16
+ ! CHECK: %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i16 to f16
+ ! CHECK: %[[V_209:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 516 : i32}> : (f16) -> i1
+ ! CHECK: fir.if %[[V_209]] {
+ ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_210:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 144 : i32}> : (f16) -> i1
+ ! CHECK: fir.if %[[V_210]] {
+ ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_208]] : f16
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_203]] : f16
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_64]] to %[[V_9]] : !fir.ref<f16>
+ r2 = ieee_next_after(x2, x10)
+ print "('after: ', z4.4, ' -> ', z4.4, ' = ', g0)", x2, r2, r2
+
+ ! CHECK: %[[V_81:[0-9]+]] = fir.load %[[V_23]] : !fir.ref<bf16>
+ ! CHECK: %[[V_82:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 3 : i32}> : (bf16) -> i1
+ ! CHECK: %[[V_83:[0-9]+]] = fir.convert %[[V_81]] : (bf16) -> f32
+ ! CHECK: %[[V_84:[0-9]+]] = arith.bitcast %[[V_83]] : f32 to i32
+ ! CHECK: %[[V_85:[0-9]+]] = arith.shrui %[[V_84]], %c31{{.*}} : i32
+ ! CHECK: %[[V_86:[0-9]+]] = fir.convert %[[V_85]] : (i32) -> i1
+ ! CHECK: %[[V_87:[0-9]+]] = arith.cmpi ne, %[[V_86]], %true{{[_0-9]*}} : i1
+ ! CHECK: %[[V_88:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 516 : i32}> : (bf16) -> i1
+ ! CHECK: %[[V_89:[0-9]+]] = arith.andi %[[V_88]], %[[V_87]] : i1
+ ! CHECK: %[[V_90:[0-9]+]] = arith.ori %[[V_82]], %[[V_89]] : i1
+ ! CHECK: %[[V_91:[0-9]+]] = fir.if %[[V_90]] -> (bf16) {
+ ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_81]]) <{bit = 1 : i32}> : (bf16) -> i1
+ ! CHECK: fir.if %[[V_202]] {
+ ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_81]] : bf16
+ ! CHECK: } else {
+ ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_81]], %cst{{[_0-9]*}} fastmath<contract> : bf16
+ ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (bf16) {
+ ! CHECK: fir.result %cst{{[_0-9]*}} : bf16
+ ! CHECK: } else {
+ ! CHECK: %[[V_204:[0-9]+]] = arith.bitcast %[[V_81]] : bf16 to i16
+ ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i16
+ ! CHECK-DAG: %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i16
+ ! CHECK: %[[V_207:[0-9]+]] = arith.select %[[V_87]], %[[V_206]], %[[V_205]] : i16
+ ! CHECK: %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i16 to bf16
+ ! CHECK: fir.result %[[V_208]] : bf16
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_203]] : bf16
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_91]] to %[[V_11]] : !fir.ref<bf16>
+ r3 = ieee_next_up(x3)
+ print "('up: ', z4.4, ' -> ', z4.4, ' = ', g0)", x3, r3, r3
+
+ ! CHECK: %[[V_104:[0-9]+]] = fir.load %[[V_25]] : !fir.ref<f32>
+ ! CHECK: %[[V_105:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 3 : i32}> : (f32) -> i1
+ ! CHECK: %[[V_106:[0-9]+]] = arith.bitcast %[[V_104]] : f32 to i32
+ ! CHECK: %[[V_107:[0-9]+]] = arith.shrui %[[V_106]], %c31{{.*}} : i32
+ ! CHECK: %[[V_108:[0-9]+]] = fir.convert %[[V_107]] : (i32) -> i1
+ ! CHECK: %[[V_109:[0-9]+]] = arith.cmpi ne, %[[V_108]], %false{{[_0-9]*}} : i1
+ ! CHECK: %[[V_110:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 516 : i32}> : (f32) -> i1
+ ! CHECK: %[[V_111:[0-9]+]] = arith.andi %[[V_110]], %[[V_109]] : i1
+ ! CHECK: %[[V_112:[0-9]+]] = arith.ori %[[V_105]], %[[V_111]] : i1
+ ! CHECK: %[[V_113:[0-9]+]] = fir.if %[[V_112]] -> (f32) {
+ ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 1 : i32}> : (f32) -> i1
+ ! CHECK: fir.if %[[V_202]] {
+ ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_104]] : f32
+ ! CHECK: } else {
+ ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_104]], %cst{{[_0-9]*}} fastmath<contract> : f32
+ ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f32) {
+ ! CHECK: fir.result %cst{{[_0-9]*}} : f32
+ ! CHECK: } else {
+ ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_106]], %c1{{.*}} : i32
+ ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_106]], %c1{{.*}} : i32
+ ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_109]], %[[V_205]], %[[V_204]] : i32
+ ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i32 to f32
+ ! CHECK: fir.result %[[V_207]] : f32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_203]] : f32
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_113]] to %[[V_13]] : !fir.ref<f32>
+ r4 = ieee_next_down(x4)
+ print "('down: ', z8.8, ' -> ', z8.8, ' = ', g0)", x4, r4, r4
+
+ ! CHECK: %[[V_125:[0-9]+]] = fir.load %[[V_27]] : !fir.ref<f64>
+ ! CHECK: %[[V_126:[0-9]+]] = fir.load %[[V_21]] : !fir.ref<f16>
+ ! CHECK-DAG: %[[V_127:[0-9]+]] = fir.address_of(@_FortranAIeeeValueTable_8) : !fir.ref<!fir.array<12xi64>>
+ ! CHECK-DAG: %[[V_128:[0-9]+]] = fir.coordinate_of %[[V_127]], %c2{{.*}} : (!fir.ref<!fir.array<12xi64>>, i8) -> !fir.ref<i64>
+ ! CHECK-DAG: %[[V_129:[0-9]+]] = fir.load %[[V_128]] : !fir.ref<i64>
+ ! CHECK-DAG: %[[V_130:[0-9]+]] = arith.bitcast %[[V_129]] : i64 to f64
+ ! CHECK-DAG: %[[V_131:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_126]]) <{bit = 3 : i32}> : (f16) -> i1
+ ! CHECK: %[[V_132:[0-9]+]] = arith.select %[[V_131]], %[[V_130]], %[[V_125]] : f64
+ ! CHECK: %[[V_133:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_132]]) <{bit = 3 : i32}> : (f64) -> i1
+ ! CHECK: %[[V_134:[0-9]+]] = fir.convert %[[V_126]] : (f16) -> f64
+ ! CHECK: %[[V_135:[0-9]+]] = arith.cmpf oeq, %[[V_132]], %[[V_134]] fastmath<contract> : f64
+ ! CHECK: %[[V_136:[0-9]+]] = arith.ori %[[V_133]], %[[V_135]] : i1
+ ! CHECK: %[[V_137:[0-9]+]] = arith.cmpf olt, %[[V_132]], %[[V_134]] fastmath<contract> : f64
+ ! CHECK: %[[V_138:[0-9]+]] = arith.bitcast %[[V_125]] : f64 to i64
+ ! CHECK: %[[V_139:[0-9]+]] = arith.shrui %[[V_138]], %c63{{.*}} : i64
+ ! CHECK: %[[V_140:[0-9]+]] = fir.convert %[[V_139]] : (i64) -> i1
+ ! CHECK: %[[V_141:[0-9]+]] = arith.cmpi ne, %[[V_137]], %[[V_140]] : i1
+ ! CHECK: %[[V_142:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_132]]) <{bit = 516 : i32}> : (f64) -> i1
+ ! CHECK: %[[V_143:[0-9]+]] = arith.andi %[[V_142]], %[[V_141]] : i1
+ ! CHECK: %[[V_144:[0-9]+]] = arith.ori %[[V_136]], %[[V_143]] : i1
+ ! CHECK: %[[V_145:[0-9]+]] = fir.if %[[V_144]] -> (f64) {
+ ! CHECK: fir.result %[[V_132]] : f64
+ ! CHECK: } else {
+ ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_132]], %cst{{[_0-9]*}} fastmath<contract> : f64
+ ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f64) {
+ ! CHECK: %[[V_204:[0-9]+]] = arith.select %[[V_137]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f64
+ ! CHECK: %[[V_205:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_204]] : f64
+ ! CHECK: } else {
+ ! CHECK: %[[V_204:[0-9]+]] = arith.bitcast %[[V_132]] : f64 to i64
+ ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.subi %[[V_204]], %c1{{.*}} : i64
+ ! CHECK-DAG: %[[V_206:[0-9]+]] = arith.addi %[[V_204]], %c1{{.*}} : i64
+ ! CHECK: %[[V_207:[0-9]+]] = arith.select %[[V_141]], %[[V_206]], %[[V_205]] : i64
+ ! CHECK: %[[V_208:[0-9]+]] = arith.bitcast %[[V_207]] : i64 to f64
+ ! CHECK: %[[V_209:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 516 : i32}> : (f64) -> i1
+ ! CHECK: fir.if %[[V_209]] {
+ ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_210:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_208]]) <{bit = 144 : i32}> : (f64) -> i1
+ ! CHECK: fir.if %[[V_210]] {
+ ! CHECK: %[[V_211:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_211]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_208]] : f64
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_203]] : f64
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_145]] to %[[V_15]] : !fir.ref<f64>
+ r8 = ieee_next_after(x8, x2)
+ print "('after: ', z16.16, ' -> ', z16.16, ' = ', g0)", x8, r8, r8
+
+ ! CHECK: %[[V_158:[0-9]+]] = fir.load %[[V_17]] : !fir.ref<f80>
+ ! CHECK: %[[V_159:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 3 : i32}> : (f80) -> i1
+ ! CHECK: %[[V_160:[0-9]+]] = arith.bitcast %[[V_158]] : f80 to i80
+ ! CHECK: %[[V_161:[0-9]+]] = arith.shrui %[[V_160]], %c79{{.*}} : i80
+ ! CHECK: %[[V_162:[0-9]+]] = fir.convert %[[V_161]] : (i80) -> i1
+ ! CHECK: %[[V_163:[0-9]+]] = arith.cmpi ne, %[[V_162]], %true{{[_0-9]*}} : i1
+ ! CHECK: %[[V_164:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 516 : i32}> : (f80) -> i1
+ ! CHECK: %[[V_165:[0-9]+]] = arith.andi %[[V_164]], %[[V_163]] : i1
+ ! CHECK: %[[V_166:[0-9]+]] = arith.ori %[[V_159]], %[[V_165]] : i1
+ ! CHECK: %[[V_167:[0-9]+]] = fir.if %[[V_166]] -> (f80) {
+ ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_158]]) <{bit = 1 : i32}> : (f80) -> i1
+ ! CHECK: fir.if %[[V_202]] {
+ ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_158]] : f80
+ ! CHECK: } else {
+ ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_158]], %cst{{[_0-9]*}} fastmath<contract> : f80
+ ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f80) {
+ ! CHECK: fir.result %cst{{[_0-9]*}} : f80
+ ! CHECK: } else {
+ ! CHECK: %[[V_204:[0-9]+]] = fir.call @_FortranAMapException(%c63{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: %[[V_205:[0-9]+]] = fir.call @fetestexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: %[[V_206:[0-9]+]] = fir.call @fedisableexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: %[[V_207:[0-9]+]] = fir.call @_FortranANearest10(%[[V_158]], %true{{[_0-9]*}}) fastmath<contract> : (f80, i1) -> f80
+ ! CHECK: %[[V_208:[0-9]+]] = fir.call @feclearexcept(%[[V_204]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: %[[V_209:[0-9]+]] = fir.call @feraiseexcept(%[[V_205]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: %[[V_210:[0-9]+]] = fir.call @feenableexcept(%[[V_206]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_207]] : f80
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_203]] : f80
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_167]] to %[[V_5]] : !fir.ref<f80>
+ r10 = ieee_next_up(x10)
+ print "('up: ', z20.20, ' -> ', z20.20, ' = ', g0)", x10, r10, r10
+
+ ! CHECK: %[[V_180:[0-9]+]] = fir.load %[[V_19]] : !fir.ref<f128>
+ ! CHECK: %[[V_181:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 3 : i32}> : (f128) -> i1
+ ! CHECK: %[[V_182:[0-9]+]] = arith.bitcast %[[V_180]] : f128 to i128
+ ! CHECK: %[[V_183:[0-9]+]] = arith.shrui %[[V_182]], %c127{{.*}} : i128
+ ! CHECK: %[[V_184:[0-9]+]] = fir.convert %[[V_183]] : (i128) -> i1
+ ! CHECK: %[[V_185:[0-9]+]] = arith.cmpi ne, %[[V_184]], %false{{[_0-9]*}} : i1
+ ! CHECK: %[[V_186:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK: %[[V_187:[0-9]+]] = arith.andi %[[V_186]], %[[V_185]] : i1
+ ! CHECK: %[[V_188:[0-9]+]] = arith.ori %[[V_181]], %[[V_187]] : i1
+ ! CHECK: %[[V_189:[0-9]+]] = fir.if %[[V_188]] -> (f128) {
+ ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 1 : i32}> : (f128) -> i1
+ ! CHECK: fir.if %[[V_202]] {
+ ! CHECK: %[[V_203:[0-9]+]] = fir.call @_FortranAMapException(%c1{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_203]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_180]] : f128
+ ! CHECK: } else {
+ ! CHECK: %[[V_202:[0-9]+]] = arith.cmpf oeq, %[[V_180]], %cst{{[_0-9]*}} fastmath<contract> : f128
+ ! CHECK: %[[V_203:[0-9]+]] = fir.if %[[V_202]] -> (f128) {
+ ! CHECK: fir.result %cst{{[_0-9]*}} : f128
+ ! CHECK: } else {
+ ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_182]], %c1{{.*}} : i128
+ ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_182]], %c1{{.*}} : i128
+ ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_185]], %[[V_205]], %[[V_204]] : i128
+ ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i128 to f128
+ ! CHECK: fir.result %[[V_207]] : f128
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_203]] : f128
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_189]] to %[[V_7]] : !fir.ref<f128>
+
+ r16 = ieee_next_down(x16)
+ print "('down: ', z32.32, ' -> ', z32.32, ' = ', g0)", x16, r16, r16
+end
diff --git a/flang/test/Lower/Intrinsics/nearest.f90 b/flang/test/Lower/Intrinsics/nearest.f90
index a023fa8..5920d29 100644
--- a/flang/test/Lower/Intrinsics/nearest.f90
+++ b/flang/test/Lower/Intrinsics/nearest.f90
@@ -1,72 +1,407 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
-! CHECK-LABEL: nearest_test1
+! CHECK-LABEL: c.func @_QPnearest_test1
+ ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f16 {bindc_name = "res", uniq_name = "_QFnearest_test1Eres"}
+ ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test1Eres"} : (!fir.ref<f16>) -> !fir.ref<f16>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Es"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test1Ex"} : (!fir.ref<f16>, !fir.dscope) -> !fir.ref<f16>
+ ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f16>
+ ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f16>
+ ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f16) -> i1
+ ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f16) -> i1
+ ! CHECK: fir.if %[[V_8]] {
+ ! CHECK: fir.call @_FortranAReportFatalUserError
+ ! CHECK: }
+ ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f16 to i16
+ ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c15{{.*}} : i16
+ ! CHECK: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i16
+ ! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f16 to i16
+ ! CHECK: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c15{{.*}} : i16
+ ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i16) -> i1
+ ! CHECK: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+ ! CHECK: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f16) -> i1
+ ! CHECK: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+ ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+ ! CHECK: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f16) {
+ ! CHECK: fir.result %[[V_5]] : f16
+ ! CHECK: } else {
+ ! CHECK: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f16
+ ! CHECK: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f16) {
+ ! CHECK: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f16
+ ! CHECK: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_22]] : f16
+ ! CHECK: } else {
+ ! CHECK-DAG: %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i16
+ ! CHECK-DAG: %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i16
+ ! CHECK: %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i16
+ ! CHECK: %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i16 to f16
+ ! CHECK: %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f16) -> i1
+ ! CHECK: fir.if %[[V_26]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f16) -> i1
+ ! CHECK: fir.if %[[V_27]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_25]] : f16
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_21]] : f16
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f16>
+ ! CHECK: return
+ ! CHECK: }
subroutine nearest_test1(x, s)
- real :: x, s, res
- ! CHECK: %[[res:.*]] = fir.alloca f32 {bindc_name = "res", uniq_name = "_QFnearest_test1Eres"}
- ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f32>
- ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f32>
- ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
- ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f32
- ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
- res = nearest(x, s)
- ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest4(%[[x]], %[[pos]]) {{.*}}: (f32, i1) -> f32
- ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f32>
- end subroutine nearest_test1
-
- ! CHECK-LABEL: nearest_test2
- subroutine nearest_test2(x, s)
- real(kind=8) :: x, s, res
- ! CHECK: %[[res:.*]] = fir.alloca f64 {bindc_name = "res", uniq_name = "_QFnearest_test2Eres"}
- ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f64>
- ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f64>
- ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f64
- ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f64
- ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
- res = nearest(x, s)
- ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest8(%[[x]], %[[pos]]) {{.*}}: (f64, i1) -> f64
- ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f64>
- end subroutine nearest_test2
-
- ! CHECK-LABEL: nearest_test3
- subroutine nearest_test3(x, s)
- real(kind=10) :: x, s, res
- ! CHECK: %[[res:.*]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test3Eres"}
- ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f80>
- ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f80>
- ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f80
- ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f80
- ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
- res = nearest(x, s)
- ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest10(%[[x]], %[[pos]]) {{.*}}: (f80, i1) -> f80
- ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f80>
- end subroutine nearest_test3
-
- ! CHECK-LABEL: nearest_test4
- subroutine nearest_test4(x, s)
- real(kind=16) :: x, s, res
- ! CHECK: %[[res:.*]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test4Eres"}
- ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f128>
- ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f128>
- ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f128
- ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f128
- ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
- res = nearest(x, s)
- ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest16(%[[x]], %[[pos]]) {{.*}}: (f128, i1) -> f128
- ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f128>
- end subroutine nearest_test4
-
- ! CHECK-LABEL: nearest_test5
- subroutine nearest_test5(x, s)
- real(kind=16) :: x, res
- ! CHECK: %[[res:.*]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
- ! CHECK: %[[x:.*]] = fir.load %arg0 : !fir.ref<f128>
- real :: s
- ! CHECK: %[[s:.*]] = fir.load %arg1 : !fir.ref<f32>
- ! CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
- ! CHECK: %[[cmp:.*]] = arith.cmpf ogt, %[[s]], %[[zero]] {{.*}} : f32
- ! CHECK: %[[pos:.*]] = arith.select %[[cmp]], %true, %false : i1
- res = nearest(x, s)
- ! CHECK: %[[tmp:.*]] = fir.call @_FortranANearest16(%[[x]], %[[pos]]) {{.*}}: (f128, i1) -> f128
- ! CHECK: fir.store %[[tmp]] to %[[res]] : !fir.ref<f128>
- end subroutine nearest_test5
+ real(kind=2) :: x, s, res
+ res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test2
+ ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK: %[[V_1:[0-9]+]] = fir.alloca bf16 {bindc_name = "res", uniq_name = "_QFnearest_test2Eres"}
+ ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test2Eres"} : (!fir.ref<bf16>) -> !fir.ref<bf16>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Es"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test2Ex"} : (!fir.ref<bf16>, !fir.dscope) -> !fir.ref<bf16>
+ ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<bf16>
+ ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<bf16>
+ ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (bf16) -> i1
+ ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (bf16) -> i1
+ ! CHECK: fir.if %[[V_8]] {
+ ! CHECK: fir.call @_FortranAReportFatalUserError
+ ! CHECK: }
+ ! CHECK: %[[V_9:[0-9]+]] = fir.convert %[[V_6]] : (bf16) -> f32
+ ! CHECK: %[[V_10:[0-9]+]] = arith.bitcast %[[V_9]] : f32 to i32
+ ! CHECK: %[[V_11:[0-9]+]] = arith.shrui %[[V_10]], %c31{{.*}} : i32
+ ! CHECK: %[[V_12:[0-9]+]] = fir.convert %[[V_11]] : (i32) -> i16
+ ! CHECK: %[[V_13:[0-9]+]] = arith.cmpi ne, %[[V_12]], %c1{{.*}} : i16
+ ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_5]] : (bf16) -> f32
+ ! CHECK: %[[V_15:[0-9]+]] = arith.bitcast %[[V_14]] : f32 to i32
+ ! CHECK: %[[V_16:[0-9]+]] = arith.shrui %[[V_15]], %c31{{.*}} : i32
+ ! CHECK: %[[V_17:[0-9]+]] = fir.convert %[[V_16]] : (i32) -> i1
+ ! CHECK: %[[V_18:[0-9]+]] = arith.cmpi ne, %[[V_13]], %[[V_17]] : i1
+ ! CHECK: %[[V_19:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (bf16) -> i1
+ ! CHECK: %[[V_20:[0-9]+]] = arith.andi %[[V_19]], %[[V_18]] : i1
+ ! CHECK: %[[V_21:[0-9]+]] = arith.ori %[[V_7]], %[[V_20]] : i1
+ ! CHECK: %[[V_22:[0-9]+]] = fir.if %[[V_21]] -> (bf16) {
+ ! CHECK: fir.result %[[V_5]] : bf16
+ ! CHECK: } else {
+ ! CHECK: %[[V_23:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : bf16
+ ! CHECK: %[[V_24:[0-9]+]] = fir.if %[[V_23]] -> (bf16) {
+ ! CHECK: %[[V_25:[0-9]+]] = arith.select %[[V_13]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : bf16
+ ! CHECK: %[[V_26:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_26]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_25]] : bf16
+ ! CHECK: } else {
+ ! CHECK: %[[V_25:[0-9]+]] = arith.bitcast %[[V_5]] : bf16 to i16
+ ! CHECK-DAG: %[[V_26:[0-9]+]] = arith.subi %[[V_25]], %c1{{.*}} : i16
+ ! CHECK-DAG: %[[V_27:[0-9]+]] = arith.addi %[[V_25]], %c1{{.*}} : i16
+ ! CHECK: %[[V_28:[0-9]+]] = arith.select %[[V_18]], %[[V_27]], %[[V_26]] : i16
+ ! CHECK: %[[V_29:[0-9]+]] = arith.bitcast %[[V_28]] : i16 to bf16
+ ! CHECK: %[[V_30:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_29]]) <{bit = 516 : i32}> : (bf16) -> i1
+ ! CHECK: fir.if %[[V_30]] {
+ ! CHECK: %[[V_32:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_32]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_31:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_29]]) <{bit = 144 : i32}> : (bf16) -> i1
+ ! CHECK: fir.if %[[V_31]] {
+ ! CHECK: %[[V_32:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_32]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_29]] : bf16
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_24]] : bf16
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_22]] to %[[V_2]] : !fir.ref<bf16>
+ ! CHECK: return
+ ! CHECK: }
+subroutine nearest_test2(x, s)
+ real(kind=3) :: x, s, res
+ res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test3
+ ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f32 {bindc_name = "res", uniq_name = "_QFnearest_test3Eres"}
+ ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test3Eres"} : (!fir.ref<f32>) -> !fir.ref<f32>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test3Ex"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f32>
+ ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
+ ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f32) -> i1
+ ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f32) -> i1
+ ! CHECK: fir.if %[[V_8]] {
+ ! CHECK: fir.call @_FortranAReportFatalUserError
+ ! CHECK: }
+ ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f32 to i32
+ ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c31{{.*}} : i32
+ ! CHECK: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i32
+ ! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f32 to i32
+ ! CHECK: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c31{{.*}} : i32
+ ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i32) -> i1
+ ! CHECK: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+ ! CHECK: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f32) -> i1
+ ! CHECK: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+ ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+ ! CHECK: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f32) {
+ ! CHECK: fir.result %[[V_5]] : f32
+ ! CHECK: } else {
+ ! CHECK: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f32
+ ! CHECK: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f32) {
+ ! CHECK: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f32
+ ! CHECK: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_22]] : f32
+ ! CHECK: } else {
+ ! CHECK-DAG: %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i32
+ ! CHECK-DAG: %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i32
+ ! CHECK: %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i32
+ ! CHECK: %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i32 to f32
+ ! CHECK: %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f32) -> i1
+ ! CHECK: fir.if %[[V_26]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f32) -> i1
+ ! CHECK: fir.if %[[V_27]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_25]] : f32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_21]] : f32
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f32>
+ ! CHECK: return
+ ! CHECK: }
+subroutine nearest_test3(x, s)
+ real :: x, s, res
+ res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test4
+ ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f64 {bindc_name = "res", uniq_name = "_QFnearest_test4Eres"}
+ ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test4Eres"} : (!fir.ref<f64>) -> !fir.ref<f64>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Es"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test4Ex"} : (!fir.ref<f64>, !fir.dscope) -> !fir.ref<f64>
+ ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f64>
+ ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f64>
+ ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f64) -> i1
+ ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f64) -> i1
+ ! CHECK: fir.if %[[V_8]] {
+ ! CHECK: fir.call @_FortranAReportFatalUserError
+ ! CHECK: }
+ ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f64 to i64
+ ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c63{{.*}} : i64
+ ! CHECK: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i64
+ ! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f64 to i64
+ ! CHECK: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c63{{.*}} : i64
+ ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i64) -> i1
+ ! CHECK: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+ ! CHECK: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f64) -> i1
+ ! CHECK: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+ ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+ ! CHECK: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f64) {
+ ! CHECK: fir.result %[[V_5]] : f64
+ ! CHECK: } else {
+ ! CHECK: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f64
+ ! CHECK: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f64) {
+ ! CHECK: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f64
+ ! CHECK: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_22]] : f64
+ ! CHECK: } else {
+ ! CHECK-DAG: %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i64
+ ! CHECK-DAG: %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i64
+ ! CHECK: %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i64
+ ! CHECK: %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i64 to f64
+ ! CHECK: %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f64) -> i1
+ ! CHECK: fir.if %[[V_26]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f64) -> i1
+ ! CHECK: fir.if %[[V_27]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_25]] : f64
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_21]] : f64
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f64>
+ ! CHECK: return
+ ! CHECK: }
+subroutine nearest_test4(x, s)
+ real(kind=8) :: x, s, res
+ res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test5
+ ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f80 {bindc_name = "res", uniq_name = "_QFnearest_test5Eres"}
+ ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test5Eres"} : (!fir.ref<f80>) -> !fir.ref<f80>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Es"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test5Ex"} : (!fir.ref<f80>, !fir.dscope) -> !fir.ref<f80>
+ ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f80>
+ ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f80>
+ ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f80) -> i1
+ ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f80) -> i1
+ ! CHECK: fir.if %[[V_8]] {
+ ! CHECK: fir.call @_FortranAReportFatalUserError
+ ! CHECK: }
+ ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f80 to i80
+ ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c79{{.*}} : i80
+ ! CHECK: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i80
+ ! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f80 to i80
+ ! CHECK: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c79{{.*}} : i80
+ ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i80) -> i1
+ ! CHECK: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+ ! CHECK: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f80) -> i1
+ ! CHECK: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+ ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+ ! CHECK: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f80) {
+ ! CHECK: fir.result %[[V_5]] : f80
+ ! CHECK: } else {
+ ! CHECK: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f80
+ ! CHECK: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f80) {
+ ! CHECK: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f80
+ ! CHECK: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_22]] : f80
+ ! CHECK: } else {
+ ! CHECK: %[[V_22:[0-9]+]] = fir.call @_FortranANearest10(%[[V_5]], %[[V_11]]) fastmath<contract> : (f80, i1) -> f80
+ ! CHECK: fir.result %[[V_22]] : f80
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_21]] : f80
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f80>
+ ! CHECK: return
+ ! CHECK: }
+subroutine nearest_test5(x, s)
+ real(kind=10) :: x, s, res
+ res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test6
+ ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test6Eres"}
+ ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test6Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Es"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test6Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
+ ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f128>
+ ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
+ ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f128) -> i1
+ ! CHECK: fir.if %[[V_8]] {
+ ! CHECK: fir.call @_FortranAReportFatalUserError
+ ! CHECK: }
+ ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f128 to i128
+ ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c127{{.*}} : i128
+ ! CHECK: %[[V_11:[0-9]+]] = arith.cmpi ne, %[[V_10]], %c1{{.*}} : i128
+ ! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
+ ! CHECK: %[[V_13:[0-9]+]] = arith.shrui %[[V_12]], %c127{{.*}} : i128
+ ! CHECK: %[[V_14:[0-9]+]] = fir.convert %[[V_13]] : (i128) -> i1
+ ! CHECK: %[[V_15:[0-9]+]] = arith.cmpi ne, %[[V_11]], %[[V_14]] : i1
+ ! CHECK: %[[V_16:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK: %[[V_17:[0-9]+]] = arith.andi %[[V_16]], %[[V_15]] : i1
+ ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_7]], %[[V_17]] : i1
+ ! CHECK: %[[V_19:[0-9]+]] = fir.if %[[V_18]] -> (f128) {
+ ! CHECK: fir.result %[[V_5]] : f128
+ ! CHECK: } else {
+ ! CHECK: %[[V_20:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
+ ! CHECK: %[[V_21:[0-9]+]] = fir.if %[[V_20]] -> (f128) {
+ ! CHECK: %[[V_22:[0-9]+]] = arith.select %[[V_11]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
+ ! CHECK: %[[V_23:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_23]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_22]] : f128
+ ! CHECK: } else {
+ ! CHECK-DAG: %[[V_22:[0-9]+]] = arith.subi %[[V_12]], %c1{{.*}} : i128
+ ! CHECK-DAG: %[[V_23:[0-9]+]] = arith.addi %[[V_12]], %c1{{.*}} : i128
+ ! CHECK: %[[V_24:[0-9]+]] = arith.select %[[V_15]], %[[V_23]], %[[V_22]] : i128
+ ! CHECK: %[[V_25:[0-9]+]] = arith.bitcast %[[V_24]] : i128 to f128
+ ! CHECK: %[[V_26:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK: fir.if %[[V_26]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_25]]) <{bit = 144 : i32}> : (f128) -> i1
+ ! CHECK: fir.if %[[V_27]] {
+ ! CHECK: %[[V_28:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_28]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_25]] : f128
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_21]] : f128
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_19]] to %[[V_2]] : !fir.ref<f128>
+ ! CHECK: return
+ ! CHECK: }
+subroutine nearest_test6(x, s)
+ real(kind=16) :: x, s, res
+ res = nearest(x, s)
+end
+
+! CHECK-LABEL: c.func @_QPnearest_test7
+ ! CHECK: %[[V_0:[0-9]+]] = fir.dummy_scope : !fir.dscope
+ ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f128 {bindc_name = "res", uniq_name = "_QFnearest_test7Eres"}
+ ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_1]] {uniq_name = "_QFnearest_test7Eres"} : (!fir.ref<f128>) -> !fir.ref<f128>
+ ! CHECK: %[[V_3:[0-9]+]] = fir.declare %arg1 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Es"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+ ! CHECK: %[[V_4:[0-9]+]] = fir.declare %arg0 dummy_scope %[[V_0]] {uniq_name = "_QFnearest_test7Ex"} : (!fir.ref<f128>, !fir.dscope) -> !fir.ref<f128>
+ ! CHECK: %[[V_5:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<f128>
+ ! CHECK: %[[V_6:[0-9]+]] = fir.load %[[V_3]] : !fir.ref<f32>
+ ! CHECK: %[[V_7:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 3 : i32}> : (f128) -> i1
+ ! CHECK: %[[V_8:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_6]]) <{bit = 96 : i32}> : (f32) -> i1
+ ! CHECK: fir.if %[[V_8]] {
+ ! CHECK: fir.call @_FortranAReportFatalUserError
+ ! CHECK: }
+ ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_6]] : f32 to i32
+ ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c31{{.*}} : i32
+ ! CHECK: %[[V_11:[0-9]+]] = fir.convert %[[V_10]] : (i32) -> i128
+ ! CHECK: %[[V_12:[0-9]+]] = arith.cmpi ne, %[[V_11]], %c1{{.*}} : i128
+ ! CHECK: %[[V_13:[0-9]+]] = arith.bitcast %[[V_5]] : f128 to i128
+ ! CHECK: %[[V_14:[0-9]+]] = arith.shrui %[[V_13]], %c127{{.*}} : i128
+ ! CHECK: %[[V_15:[0-9]+]] = fir.convert %[[V_14]] : (i128) -> i1
+ ! CHECK: %[[V_16:[0-9]+]] = arith.cmpi ne, %[[V_12]], %[[V_15]] : i1
+ ! CHECK: %[[V_17:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_5]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK: %[[V_18:[0-9]+]] = arith.andi %[[V_17]], %[[V_16]] : i1
+ ! CHECK: %[[V_19:[0-9]+]] = arith.ori %[[V_7]], %[[V_18]] : i1
+ ! CHECK: %[[V_20:[0-9]+]] = fir.if %[[V_19]] -> (f128) {
+ ! CHECK: fir.result %[[V_5]] : f128
+ ! CHECK: } else {
+ ! CHECK: %[[V_21:[0-9]+]] = arith.cmpf oeq, %[[V_5]], %cst{{[_0-9]*}} fastmath<contract> : f128
+ ! CHECK: %[[V_22:[0-9]+]] = fir.if %[[V_21]] -> (f128) {
+ ! CHECK: %[[V_23:[0-9]+]] = arith.select %[[V_12]], %cst{{[_0-9]*}}, %cst{{[_0-9]*}} : f128
+ ! CHECK: %[[V_24:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_24]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.result %[[V_23]] : f128
+ ! CHECK: } else {
+ ! CHECK-DAG: %[[V_23:[0-9]+]] = arith.subi %[[V_13]], %c1{{.*}} : i128
+ ! CHECK-DAG: %[[V_24:[0-9]+]] = arith.addi %[[V_13]], %c1{{.*}} : i128
+ ! CHECK: %[[V_25:[0-9]+]] = arith.select %[[V_16]], %[[V_24]], %[[V_23]] : i128
+ ! CHECK: %[[V_26:[0-9]+]] = arith.bitcast %[[V_25]] : i128 to f128
+ ! CHECK: %[[V_27:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 516 : i32}> : (f128) -> i1
+ ! CHECK: fir.if %[[V_27]] {
+ ! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c40{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: %[[V_28:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_26]]) <{bit = 144 : i32}> : (f128) -> i1
+ ! CHECK: fir.if %[[V_28]] {
+ ! CHECK: %[[V_29:[0-9]+]] = fir.call @_FortranAMapException(%c48{{.*}}) fastmath<contract> : (i32) -> i32
+ ! CHECK: fir.call @feraiseexcept(%[[V_29]]) fastmath<contract> : (i32) -> i32
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_26]] : f128
+ ! CHECK: }
+ ! CHECK: fir.result %[[V_22]] : f128
+ ! CHECK: }
+ ! CHECK: fir.store %[[V_20]] to %[[V_2]] : !fir.ref<f128>
+ ! CHECK: return
+ ! CHECK: }
+subroutine nearest_test7(x, s)
+ real(kind=16) :: x, res
+ real :: s
+ res = nearest(x, s)
+end
diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90
index fa910e7..0d2594c 100644
--- a/flang/test/Lower/OpenACC/acc-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-loop.f90
@@ -317,10 +317,10 @@ subroutine sub1(i, j, k)
end subroutine
! CHECK: func.func @_QPsub1
+! CHECK: acc.parallel
! CHECK: %[[DC_K:.*]] = fir.alloca i32 {bindc_name = "k"}
! CHECK: %[[DC_J:.*]] = fir.alloca i32 {bindc_name = "j"}
! CHECK: %[[DC_I:.*]] = fir.alloca i32 {bindc_name = "i"}
-! CHECK: acc.parallel
! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = ""}
! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = ""}
! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = ""}
diff --git a/flang/test/Lower/OpenMP/associate.f90 b/flang/test/Lower/OpenMP/associate.f90
new file mode 100644
index 0000000..c6890f0
--- /dev/null
+++ b/flang/test/Lower/OpenMP/associate.f90
@@ -0,0 +1,38 @@
+! Check that constructs with associate and variables that have implicitly
+! determined DSAs are lowered properly.
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: func @_QPtest_parallel_assoc
+!CHECK: omp.parallel {
+!CHECK-NOT: hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEa"}
+!CHECK-NOT: hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEb"}
+!CHECK: omp.wsloop {
+!CHECK: }
+!CHECK: }
+!CHECK: omp.parallel {
+!CHECK-NOT: hlfir.declare {{.*}} {uniq_name = "_QFtest_parallel_assocEb"}
+!CHECK: omp.wsloop {
+!CHECK: }
+!CHECK: }
+subroutine test_parallel_assoc()
+ integer, parameter :: l = 3
+ integer :: a(l)
+ integer :: i
+ a = 1
+
+ !$omp parallel do
+ do i = 1,l
+ associate (b=>a)
+ b(i) = b(i) * 2
+ end associate
+ enddo
+ !$omp end parallel do
+
+ !$omp parallel do default(private)
+ do i = 1,l
+ associate (b=>a)
+ b(i) = b(i) * 2
+ end associate
+ enddo
+ !$omp end parallel do
+end subroutine
diff --git a/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90 b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
index 2557927..21992aa 100644
--- a/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
+++ b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
@@ -1,6 +1,6 @@
!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
-!CHECK: @_QPsb
+!CHECK-LABEL: func @_QPsb
subroutine sb(a)
integer :: a(:)
!CHECK: omp.parallel
@@ -9,3 +9,16 @@ subroutine sb(a)
if (any(a/=(/(100,i=1,5)/))) print *, "OK"
!$omp end parallel
end subroutine
+
+!CHECK-LABEL: func @_QPsb2
+subroutine sb2()
+ integer, parameter :: SIZE=20
+ integer :: i, a(SIZE)
+
+! Just check that the construct below doesn't hit a TODO in lowering.
+!CHECK: omp.parallel
+ !$omp parallel
+ a = [ (i, i=1, SIZE) ]
+ print *, i
+ !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90
index 0500b21..9b92293 100644
--- a/flang/test/Lower/OpenMP/target.f90
+++ b/flang/test/Lower/OpenMP/target.f90
@@ -66,7 +66,7 @@ subroutine omp_target_enter_nowait
integer :: a(1024)
!CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
!CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_enter_data nowait map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
+ !CHECK: omp.target_enter_data map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) nowait
!$omp target enter data map(to: a) nowait
end subroutine omp_target_enter_nowait
@@ -278,7 +278,7 @@ subroutine omp_target_update_nowait
!CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}})
!CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
- !CHECK: omp.target_update nowait map_entries
+ !CHECK: omp.target_update map_entries({{.*}}) nowait
!$omp target update from(a) nowait
end subroutine omp_target_update_nowait
@@ -493,7 +493,7 @@ subroutine omp_target_thread_limit
integer :: a
!CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a"}
!CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32
- !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %{{.*}} : !fir.ref<i32>) {
+ !CHECK: omp.target map_entries(%[[MAP]] -> %{{.*}} : !fir.ref<i32>) thread_limit(%[[VAL_1]] : i32) {
!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
!$omp target map(tofrom: a) thread_limit(64)
a = 10
@@ -512,7 +512,7 @@ subroutine omp_target_device_ptr
type(c_ptr) :: a
integer, target :: b
!CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"}
- !CHECK: omp.target_data use_device_ptr({{.*}}) map_entries(%[[MAP]]{{.*}}
+ !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}) use_device_ptr({{.*}})
!$omp target data map(tofrom: a) use_device_ptr(a)
!CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>):
!CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
@@ -533,7 +533,7 @@ end subroutine omp_target_device_ptr
!CHECK: %[[VAL_0_DECL:.*]]:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
!CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
!CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : [0] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "a"}
- !CHECK: omp.target_data use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>) map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) {
+ !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0_DECL]]#1 : !fir.ref<!fir.box<!fir.ptr<i32>>>) {
!$omp target data map(tofrom: a) use_device_addr(a)
!CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
!CHECK: %[[VAL_1_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFomp_target_device_addrEa"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90
index 71ff57c..afbe2cb 100644
--- a/flang/test/Lower/OpenMP/task.f90
+++ b/flang/test/Lower/OpenMP/task.f90
@@ -227,7 +227,7 @@ subroutine task_multiple_clauses()
integer :: x, y, z
logical :: buzz
- !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) priority(%{{.+}}) allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
+ !CHECK: omp.task allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) final(%{{.+}}) if(%{{.+}}) priority(%{{.+}}) {
!$omp task if(buzz) final(buzz) priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y)
!CHECK: %[[X_PRIV_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFtask_multiple_clausesEx"}
diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
index 0ad06f7..acb5f53 100644
--- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
+++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
@@ -8,7 +8,7 @@
! functionality
!CHECK: func.func @{{.*}}only_use_device_ptr()
-!CHECK: omp.target_data use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) {
+!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
subroutine only_use_device_ptr
use iso_c_binding
@@ -21,7 +21,7 @@ subroutine only_use_device_ptr
end subroutine
!CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr()
-!CHECK: omp.target_data use_device_ptr({{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {
+!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) use_device_ptr({{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
subroutine mix_use_device_ptr_and_addr
use iso_c_binding
@@ -47,7 +47,7 @@ subroutine only_use_device_addr
end subroutine
!CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map()
-!CHECK: omp.target_data use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) map_entries(%{{.*}}, %{{.*}} : !fir.ref<i32>, !fir.ref<i32>) {
+!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref<i32>, !fir.ref<i32>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
subroutine mix_use_device_ptr_and_addr_and_map
use iso_c_binding
diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common
index 9d73204..9341f05 100644
--- a/flang/test/Parser/cuf-sanity-common
+++ b/flang/test/Parser/cuf-sanity-common
@@ -23,7 +23,8 @@ module m
end subroutine
subroutine test
logical isPinned
- real a(10), x, y, z
+ real, device :: a(10)
+ real :: x, y, z
!$cuf kernel do(1) <<<*, *, stream = 1>>>
do j = 1, 10
end do
diff --git a/flang/test/Parser/recovery03.f90 b/flang/test/Parser/recovery03.f90
new file mode 100644
index 0000000..f3340f0d
--- /dev/null
+++ b/flang/test/Parser/recovery03.f90
@@ -0,0 +1,9 @@
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+! CHECK: error: misplaced declaration in the execution part
+! CHECK: real, pointer :: p2(:,:)
+! CHECK: in the context: execution part construct
+real, allocatable, target :: a2(:,:)
+allocate(a2(2:11,0:9))
+real, pointer :: p2(:,:)
+p2 => a2(2:3,1:2)
+end
diff --git a/flang/test/Parser/recovery04.f90 b/flang/test/Parser/recovery04.f90
new file mode 100644
index 0000000..144ebd2
--- /dev/null
+++ b/flang/test/Parser/recovery04.f90
@@ -0,0 +1,24 @@
+! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s
+module m
+ contains
+ !CHECK: expected end of statement
+ !CHECK: subroutine s1(var i, j)
+ subroutine s1(var i, j)
+ end subroutine
+ !CHECK: expected end of statement
+ !CHECK: subroutine s2[b]
+ subroutine s2[b]
+ end subroutine
+ !CHECK: expected end of statement
+ !CHECK: function f1(var i, j)
+ function f1(var i, j)
+ end function
+ !CHECK: expected end of statement
+ !CHECK: function f2[b]
+ function f2[b]
+ end function
+ !CHECK: expected end of statement
+ !CHECK: function f3(a,*)
+ function f3(a,*)
+ end function
+end
diff --git a/flang/test/Preprocessing/line-in-contin.F90 b/flang/test/Preprocessing/line-in-contin.F90
new file mode 100644
index 0000000..138e579
--- /dev/null
+++ b/flang/test/Preprocessing/line-in-contin.F90
@@ -0,0 +1,20 @@
+! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s
+! CHECK: call foo( 0.)
+! CHECK: call foo( 1.)
+! CHECK: call foo( 2.)
+! CHECK: call foo( 3.)
+call foo( &
+# 100 "bar.h"
+ & 0.)
+call foo( &
+# 101 "bar.h"
+ 1.)
+call foo( &
+# 102 "bar.h"
+ & 2. &
+ & )
+call foo( &
+# 103 "bar.h"
+ & 3. &
+ )
+end
diff --git a/flang/test/Semantics/OpenMP/copyprivate04.f90 b/flang/test/Semantics/OpenMP/copyprivate04.f90
new file mode 100644
index 0000000..291cf11
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/copyprivate04.f90
@@ -0,0 +1,112 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
+! OpenMP Version 5.2
+! 5.1.1 - Variables Referenced in a Construct
+! Copyprivate must accept variables that are predetermined as private.
+
+module m1
+ integer :: m
+end module
+
+program omp_copyprivate
+ use m1
+ implicit none
+ integer :: i
+ integer, save :: j
+ integer :: k
+ common /c/ k
+ real, parameter :: pi = 3.14
+ integer :: a1(10)
+
+ ! Local variables are private.
+ !$omp single
+ i = 123
+ !$omp end single copyprivate(i)
+ !$omp single
+ !$omp end single copyprivate(a1)
+
+ ! Variables with the SAVE attribute are not private.
+ !$omp single
+ !ERROR: COPYPRIVATE variable 'j' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(j)
+
+ ! Common block variables are not private.
+ !$omp single
+ !ERROR: COPYPRIVATE variable 'k' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(/c/)
+ !$omp single
+ !ERROR: COPYPRIVATE variable 'k' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(k)
+
+ ! Module variables are not private.
+ !$omp single
+ !ERROR: COPYPRIVATE variable 'm' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(m)
+
+ ! Parallel can make a variable shared.
+ !$omp parallel
+ !$omp single
+ i = 456
+ !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(i)
+ call sub(j, a1)
+ !$omp end parallel
+
+ !$omp parallel shared(i)
+ !$omp single
+ i = 456
+ !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(i)
+ !$omp end parallel
+
+ !FIXME: an error should be emitted in this case.
+ ! copyprivate(i) should be considered as a reference to i and a new
+ ! symbol should be created in `parallel` scope, for this case to be
+ ! handled properly.
+ !$omp parallel
+ !$omp single
+ !$omp end single copyprivate(i)
+ !$omp end parallel
+
+ ! Named constants are shared.
+ !$omp single
+ !ERROR: COPYPRIVATE variable 'pi' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(pi)
+
+ !$omp parallel do
+ do i = 1, 10
+ !$omp parallel
+ !$omp single
+ j = i
+ !ERROR: COPYPRIVATE variable 'i' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(i)
+ !$omp end parallel
+ end do
+ !$omp end parallel do
+
+contains
+ subroutine sub(s1, a)
+ integer :: s1
+ integer :: a(:)
+
+ ! Dummy argument.
+ !$omp single
+ !$omp end single copyprivate(s1)
+
+ ! Assumed shape arrays are shared.
+ !$omp single
+ !ERROR: COPYPRIVATE variable 'a' is not PRIVATE or THREADPRIVATE in outer context
+ !$omp end single copyprivate(a)
+ end subroutine
+
+ integer function fun(f1)
+ integer :: f1
+
+ ! Dummy argument.
+ !$omp single
+ !$omp end single copyprivate(f1)
+
+ ! Function result is private.
+ !$omp single
+ !$omp end single copyprivate(fun)
+ end function
+end program
diff --git a/flang/test/Semantics/OpenMP/do05-positivecase.f90 b/flang/test/Semantics/OpenMP/do05-positivecase.f90
index 4e02235..3b512a5 100644
--- a/flang/test/Semantics/OpenMP/do05-positivecase.f90
+++ b/flang/test/Semantics/OpenMP/do05-positivecase.f90
@@ -20,12 +20,12 @@ program omp_do
!$omp parallel default(shared)
!$omp do
!DEF: /omp_do/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
- !REF: /omp_do/n
+ !DEF: /omp_do/OtherConstruct2/n HostAssoc INTEGER(4)
do i=1,n
!$omp parallel
!$omp single
!DEF: /work EXTERNAL (Subroutine) ProcEntity
- !REF: /omp_do/OtherConstruct2/OtherConstruct1/i
+ !DEF: /omp_do/OtherConstruct2/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4)
call work(i, 1)
!$omp end single
!$omp end parallel
diff --git a/flang/test/Semantics/OpenMP/do20.f90 b/flang/test/Semantics/OpenMP/do20.f90
index 915d01e..0cafae7 100644
--- a/flang/test/Semantics/OpenMP/do20.f90
+++ b/flang/test/Semantics/OpenMP/do20.f90
@@ -10,7 +10,7 @@ subroutine shared_iv
!$omp parallel shared(i)
!$omp single
- !REF: /shared_iv/i
+ !DEF: /shared_iv/OtherConstruct1/i HostAssoc INTEGER(4)
do i = 0, 1
end do
!$omp end single
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
index 92d2421..2abe3a0 100644
--- a/flang/test/Semantics/OpenMP/implicit-dsa.f90
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -15,14 +15,14 @@ subroutine implicit_dsa_test1
!$omp task private(y) shared(z)
!DEF: /implicit_dsa_test1/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
!DEF: /implicit_dsa_test1/OtherConstruct1/y (OmpPrivate) HostAssoc INTEGER(4)
- !REF: /implicit_dsa_test1/z
+ !DEF: /implicit_dsa_test1/OtherConstruct1/z HostAssoc INTEGER(4)
x = y + z
!$omp end task
!$omp task default(shared)
- !REF: /implicit_dsa_test1/x
- !REF: /implicit_dsa_test1/y
- !REF: /implicit_dsa_test1/z
+ !DEF: /implicit_dsa_test1/OtherConstruct2/x HostAssoc INTEGER(4)
+ !DEF: /implicit_dsa_test1/OtherConstruct2/y HostAssoc INTEGER(4)
+ !DEF: /implicit_dsa_test1/OtherConstruct2/z HostAssoc INTEGER(4)
x = y + z
!$omp end task
@@ -61,16 +61,16 @@ subroutine implicit_dsa_test3
!$omp parallel
!$omp task
- !REF: /implicit_dsa_test3/x
+ !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct1/x HostAssoc INTEGER(4)
x = 1
- !REF: /implicit_dsa_test3/y
+ !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct1/y HostAssoc INTEGER(4)
y = 1
!$omp end task
!$omp task firstprivate(x)
!DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct2/x (OmpFirstPrivate) HostAssoc INTEGER(4)
x = 1
- !REF: /implicit_dsa_test3/z
+ !DEF: /implicit_dsa_test3/OtherConstruct1/OtherConstruct2/z HostAssoc INTEGER(4)
z = 1
!$omp end task
!$omp end parallel
diff --git a/flang/test/Semantics/OpenMP/parallel-shared05.f90 b/flang/test/Semantics/OpenMP/parallel-shared05.f90
new file mode 100644
index 0000000..bcc1a94
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/parallel-shared05.f90
@@ -0,0 +1,17 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! OpenMP Version 4.5
+! 2.15.3.2 parallel shared Clause
+program omp_parallel_shared
+ type derived
+ integer :: field(2, 3)
+ end type
+ integer :: field(2)
+ type(derived) :: y
+
+ ! Check that derived type fields and variables with the same name
+ ! don't cause errors.
+ !$omp parallel
+ y%field(2, 3) = 1
+ field(1) = 1
+ !$omp end parallel
+end program omp_parallel_shared
diff --git a/flang/test/Semantics/OpenMP/reduction08.f90 b/flang/test/Semantics/OpenMP/reduction08.f90
index 9916332..9442fbd 100644
--- a/flang/test/Semantics/OpenMP/reduction08.f90
+++ b/flang/test/Semantics/OpenMP/reduction08.f90
@@ -15,7 +15,7 @@ program omp_reduction
do i=1,10
!DEF: /omp_reduction/OtherConstruct1/k (OmpReduction) HostAssoc INTEGER(4)
!DEF: /omp_reduction/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
- !REF: /omp_reduction/m
+ !DEF: /omp_reduction/OtherConstruct1/m HostAssoc INTEGER(4)
k = max(k, m)
end do
!$omp end parallel do
@@ -25,7 +25,7 @@ program omp_reduction
do i=1,10
!DEF: /omp_reduction/OtherConstruct2/k (OmpReduction) HostAssoc INTEGER(4)
!DEF: /omp_reduction/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
- !REF: /omp_reduction/m
+ !DEF: /omp_reduction/OtherConstruct2/m HostAssoc INTEGER(4)
k = min(k, m)
end do
!$omp end parallel do
@@ -35,7 +35,7 @@ program omp_reduction
do i=1,10
!DEF: /omp_reduction/OtherConstruct3/k (OmpReduction) HostAssoc INTEGER(4)
!DEF: /omp_reduction/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
- !REF: /omp_reduction/m
+ !DEF: /omp_reduction/OtherConstruct3/m HostAssoc INTEGER(4)
k = iand(k, m)
end do
!$omp end parallel do
@@ -45,7 +45,7 @@ program omp_reduction
do i=1,10
!DEF: /omp_reduction/OtherConstruct4/k (OmpReduction) HostAssoc INTEGER(4)
!DEF: /omp_reduction/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
- !REF: /omp_reduction/m
+ !DEF: /omp_reduction/OtherConstruct4/m HostAssoc INTEGER(4)
k = ior(k, m)
end do
!$omp end parallel do
@@ -55,7 +55,7 @@ program omp_reduction
do i=1,10
!DEF: /omp_reduction/OtherConstruct5/k (OmpReduction) HostAssoc INTEGER(4)
!DEF: /omp_reduction/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
- !REF: /omp_reduction/m
+ !DEF: /omp_reduction/OtherConstruct5/m HostAssoc INTEGER(4)
k = ieor(k,m)
end do
!$omp end parallel do
diff --git a/flang/test/Semantics/OpenMP/reduction09.f90 b/flang/test/Semantics/OpenMP/reduction09.f90
index dbc8d1b0..1af2fc4 100644
--- a/flang/test/Semantics/OpenMP/reduction09.f90
+++ b/flang/test/Semantics/OpenMP/reduction09.f90
@@ -26,7 +26,7 @@ program omp_reduction
!$omp parallel do reduction(+:a(10))
!DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do i=1,10
- !REF: /omp_reduction/k
+ !DEF: /omp_reduction/OtherConstruct2/k HostAssoc INTEGER(4)
k = k+1
end do
!$omp end parallel do
@@ -35,7 +35,7 @@ program omp_reduction
!$omp parallel do reduction(+:a(1:10:1))
!DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do i=1,10
- !REF: /omp_reduction/k
+ !DEF: /omp_reduction/OtherConstruct3/k HostAssoc INTEGER(4)
k = k+1
end do
!$omp end parallel do
@@ -43,7 +43,7 @@ program omp_reduction
!$omp parallel do reduction(+:b(1:10:1,1:5,2))
!DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do i=1,10
- !REF: /omp_reduction/k
+ !DEF: /omp_reduction/OtherConstruct4/k HostAssoc INTEGER(4)
k = k+1
end do
!$omp end parallel do
@@ -51,7 +51,7 @@ program omp_reduction
!$omp parallel do reduction(+:b(1:10:1,1:5,2:5:1))
!DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do i=1,10
- !REF: /omp_reduction/k
+ !DEF: /omp_reduction/OtherConstruct5/k HostAssoc INTEGER(4)
k = k+1
end do
!$omp end parallel do
diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90
index 0b435a9..ecfb862 100644
--- a/flang/test/Semantics/OpenMP/symbol01.f90
+++ b/flang/test/Semantics/OpenMP/symbol01.f90
@@ -48,7 +48,7 @@ program mm
!DEF: /mm/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do i=1,10
!DEF: /mm/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
- !REF: /mm/b
+ !DEF: /mm/OtherConstruct1/b HostAssoc INTEGER(4)
!REF: /mm/OtherConstruct1/i
a = a+b(i)
!DEF: /mm/OtherConstruct1/t (OmpPrivate) HostAssoc TYPE(myty)
@@ -62,7 +62,7 @@ program mm
!REF: /mm/OtherConstruct1/i
!REF: /mm/OtherConstruct1/y
x = a+i+y
- !REF: /mm/c
+ !DEF: /mm/OtherConstruct1/c HostAssoc REAL(4)
c = 3.0
end do
end program
diff --git a/flang/test/Semantics/OpenMP/symbol02.f90 b/flang/test/Semantics/OpenMP/symbol02.f90
index f6ffc55..c199c52 100644
--- a/flang/test/Semantics/OpenMP/symbol02.f90
+++ b/flang/test/Semantics/OpenMP/symbol02.f90
@@ -15,9 +15,9 @@
a = 3.
!DEF: /MainProgram1/OtherConstruct1/b (OmpPrivate) HostAssoc REAL(4)
b = 4
- !REF: /MainProgram1/c
+ !DEF: /MainProgram1/OtherConstruct1/c HostAssoc REAL(4)
c = 5
- !DEF: /MainProgram1/d (Implicit) ObjectEntity REAL(4)
+ !DEF: /MainProgram1/OtherConstruct1/d HostAssoc REAL(4)
d = 6
!$omp end parallel
!DEF: /MainProgram1/a (Implicit) ObjectEntity REAL(4)
diff --git a/flang/test/Semantics/OpenMP/symbol03.f90 b/flang/test/Semantics/OpenMP/symbol03.f90
index 93e9b7a..ba941b9 100644
--- a/flang/test/Semantics/OpenMP/symbol03.f90
+++ b/flang/test/Semantics/OpenMP/symbol03.f90
@@ -9,10 +9,10 @@
!$omp parallel private(a) shared(b)
!DEF: /MainProgram1/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
a = 3.
- !REF: /MainProgram1/b
+ !DEF: /MainProgram1/OtherConstruct1/b HostAssoc REAL(4)
b = 4
!$omp parallel private(b) shared(a)
- !REF: /MainProgram1/OtherConstruct1/a
+ !DEF: /MainProgram1/OtherConstruct1/OtherConstruct1/a HostAssoc REAL(4)
a = 5.
!DEF: /MainProgram1/OtherConstruct1/OtherConstruct1/b (OmpPrivate) HostAssoc REAL(4)
b = 6
diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90
index fa0a8f6..1ad0c10 100644
--- a/flang/test/Semantics/OpenMP/symbol05.f90
+++ b/flang/test/Semantics/OpenMP/symbol05.f90
@@ -15,10 +15,10 @@ contains
!DEF: /mm/foo/a ObjectEntity INTEGER(4)
integer :: a = 3
!$omp parallel
- !REF: /mm/foo/a
+ !DEF: /mm/foo/OtherConstruct1/a HostAssoc INTEGER(4)
a = 1
!DEF: /mm/i PUBLIC (Implicit, OmpThreadprivate) ObjectEntity INTEGER(4)
- !REF: /mm/foo/a
+ !REF: /mm/foo/OtherConstruct1/a
i = a
!$omp end parallel
!REF: /mm/foo/a
diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90
index e2250f5..8b47169 100644
--- a/flang/test/Semantics/OpenMP/symbol07.f90
+++ b/flang/test/Semantics/OpenMP/symbol07.f90
@@ -23,7 +23,7 @@ subroutine function_call_in_region
!$omp parallel default(none) private(a) shared(b)
!DEF: /function_call_in_region/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
!REF: /function_call_in_region/foo
- !REF: /function_call_in_region/b
+ !DEF: /function_call_in_region/OtherConstruct1/b HostAssoc REAL(4)
a = foo(b)
!$omp end parallel
!REF: /function_call_in_region/a
diff --git a/flang/test/Semantics/OpenMP/symbol08.f90 b/flang/test/Semantics/OpenMP/symbol08.f90
index 3af85af..69ccd17 100644
--- a/flang/test/Semantics/OpenMP/symbol08.f90
+++ b/flang/test/Semantics/OpenMP/symbol08.f90
@@ -28,18 +28,18 @@ subroutine test_do
!DEF: /test_do/k ObjectEntity INTEGER(4)
integer i, j, k
!$omp parallel
- !REF: /test_do/i
+ !DEF: /test_do/OtherConstruct1/i HostAssoc INTEGER(4)
i = 99
!$omp do collapse(2)
!DEF: /test_do/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do i=1,5
!DEF: /test_do/OtherConstruct1/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do j=6,10
- !REF: /test_do/a
+ !DEF: /test_do/OtherConstruct1/a HostAssoc REAL(4)
a(1,1,1) = 0.
!DEF: /test_do/OtherConstruct1/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do k=11,15
- !REF: /test_do/a
+ !REF: /test_do/OtherConstruct1/a
!REF: /test_do/OtherConstruct1/k
!REF: /test_do/OtherConstruct1/OtherConstruct1/j
!REF: /test_do/OtherConstruct1/OtherConstruct1/i
@@ -65,11 +65,11 @@ subroutine test_pardo
do i=1,5
!DEF: /test_pardo/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do j=6,10
- !REF: /test_pardo/a
+ !DEF: /test_pardo/OtherConstruct1/a HostAssoc REAL(4)
a(1,1,1) = 0.
!DEF: /test_pardo/OtherConstruct1/k (OmpPrivate) HostAssoc INTEGER(4)
do k=11,15
- !REF: /test_pardo/a
+ !REF: /test_pardo/OtherConstruct1/a
!REF: /test_pardo/OtherConstruct1/k
!REF: /test_pardo/OtherConstruct1/j
!REF: /test_pardo/OtherConstruct1/i
@@ -138,15 +138,15 @@ subroutine dotprod (b, c, n, block_size, num_teams, block_threads)
do i0=1,n,block_size
!$omp parallel do reduction(+:sum)
!DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
- !REF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0
+ !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i0 HostAssoc INTEGER(4)
!DEF: /dotprod/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity
- !REF: /dotprod/block_size
- !REF: /dotprod/n
+ !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/block_size HostAssoc INTEGER(4)
+ !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/n HostAssoc INTEGER(4)
do i=i0,min(i0+block_size, n)
!DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/sum (OmpReduction) HostAssoc REAL(4)
- !REF: /dotprod/b
+ !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/b HostAssoc REAL(4)
!REF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/i
- !REF: /dotprod/c
+ !DEF: /dotprod/OtherConstruct1/OtherConstruct1/OtherConstruct1/OtherConstruct1/c HostAssoc REAL(4)
sum = sum+b(i)*c(i)
end do
end do
@@ -174,7 +174,7 @@ subroutine test_simd
do j=6,10
!DEF: /test_simd/OtherConstruct1/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do k=11,15
- !REF: /test_simd/a
+ !DEF: /test_simd/OtherConstruct1/a HostAssoc REAL(4)
!REF: /test_simd/OtherConstruct1/k
!REF: /test_simd/OtherConstruct1/j
!REF: /test_simd/OtherConstruct1/i
@@ -201,7 +201,7 @@ subroutine test_simd_multi
do j=6,10
!DEF: /test_simd_multi/OtherConstruct1/k (OmpLastPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
do k=11,15
- !REF: /test_simd_multi/a
+ !DEF: /test_simd_multi/OtherConstruct1/a HostAssoc REAL(4)
!REF: /test_simd_multi/OtherConstruct1/k
!REF: /test_simd_multi/OtherConstruct1/j
!REF: /test_simd_multi/OtherConstruct1/i
@@ -223,11 +223,11 @@ subroutine test_seq_loop
!REF: /test_seq_loop/j
j = -1
!$omp parallel
- !REF: /test_seq_loop/i
- !REF: /test_seq_loop/j
+ !DEF: /test_seq_loop/OtherConstruct1/i HostAssoc INTEGER(4)
+ !DEF: /test_seq_loop/OtherConstruct1/j HostAssoc INTEGER(4)
print *, i, j
!$omp parallel
- !REF: /test_seq_loop/i
+ !DEF: /test_seq_loop/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4)
!DEF: /test_seq_loop/OtherConstruct1/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
print *, i, j
!$omp do
@@ -237,12 +237,12 @@ subroutine test_seq_loop
do j=1,10
end do
end do
- !REF: /test_seq_loop/i
+ !REF: /test_seq_loop/OtherConstruct1/OtherConstruct1/i
!REF: /test_seq_loop/OtherConstruct1/OtherConstruct1/j
print *, i, j
!$omp end parallel
- !REF: /test_seq_loop/i
- !REF: /test_seq_loop/j
+ !REF: /test_seq_loop/OtherConstruct1/i
+ !REF: /test_seq_loop/OtherConstruct1/j
print *, i, j
!$omp end parallel
!REF: /test_seq_loop/i
diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90
index e2250f5..8b47169 100644
--- a/flang/test/Semantics/OpenMP/symbol09.f90
+++ b/flang/test/Semantics/OpenMP/symbol09.f90
@@ -23,7 +23,7 @@ subroutine function_call_in_region
!$omp parallel default(none) private(a) shared(b)
!DEF: /function_call_in_region/OtherConstruct1/a (OmpPrivate) HostAssoc REAL(4)
!REF: /function_call_in_region/foo
- !REF: /function_call_in_region/b
+ !DEF: /function_call_in_region/OtherConstruct1/b HostAssoc REAL(4)
a = foo(b)
!$omp end parallel
!REF: /function_call_in_region/a
diff --git a/flang/test/Semantics/associate04.f90 b/flang/test/Semantics/associate04.f90
new file mode 100644
index 0000000..5a73ba4
--- /dev/null
+++ b/flang/test/Semantics/associate04.f90
@@ -0,0 +1,7 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+subroutine bad(a)
+ real :: a(..)
+ !ERROR: Selector must not be assumed-rank
+ associate(x => a)
+ end associate
+end subroutine
diff --git a/flang/test/Semantics/call02.f90 b/flang/test/Semantics/call02.f90
index bc3dd607..0ec5530 100644
--- a/flang/test/Semantics/call02.f90
+++ b/flang/test/Semantics/call02.f90
@@ -72,6 +72,7 @@ module m01
contains
elemental real function elem03(x)
real, value :: x
+ elem03 = 0.
end function
subroutine test
intrinsic :: cos
@@ -87,6 +88,7 @@ module m01
contains
elemental real function elem04(x)
real, value :: x
+ elem04 = 0.
end function
end subroutine
end module
diff --git a/flang/test/Semantics/call05.f90 b/flang/test/Semantics/call05.f90
index 8a4386e..a06fe4f 100644
--- a/flang/test/Semantics/call05.f90
+++ b/flang/test/Semantics/call05.f90
@@ -155,11 +155,13 @@ module m2
function return_deferred_length_ptr()
character(len=:), pointer :: return_deferred_length_ptr
+ return_deferred_length_ptr => p2
end function
function return_explicit_length_ptr(n)
integer :: n
character(len=n), pointer :: return_explicit_length_ptr
+ return_explicit_length_ptr => p2(1:n)
end function
subroutine test()
diff --git a/flang/test/Semantics/contiguous01.f90 b/flang/test/Semantics/contiguous01.f90
index 0f08662..8938277 100644
--- a/flang/test/Semantics/contiguous01.f90
+++ b/flang/test/Semantics/contiguous01.f90
@@ -30,8 +30,10 @@ module m
contiguous r2
!PORTABILITY: CONTIGUOUS entity 'e' should be an array pointer, assumed-shape, or assumed-rank
entry e() result(r2)
+ r2 = 0
end
function fp()
real, pointer, contiguous :: fp(:) ! ok
+ nullify(fp)
end
end
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index d2d4d23..195ddac 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -18,6 +18,8 @@ module m
end
program main
+ integer, device :: a_d(10 ,10)
+ integer :: b(10, 10)
!$cuf kernel do <<< *, * >>> ! ok
do j = 1, 0
end do
@@ -90,4 +92,12 @@ program main
else if (ifunc() /= 1) then
end if
end do
+
+ !$cuf kernel do (2) <<<*, *>>>
+ do j = 1, 10
+ do i = 1, 10
+ !ERROR: Host array 'b' cannot be present in CUF kernel
+ a_d(i,j) = b(i,j)
+ enddo
+ enddo
end
diff --git a/flang/test/Semantics/reduce.cuf b/flang/test/Semantics/reduce.cuf
index 95ff2e8..92d12ab1 100644
--- a/flang/test/Semantics/reduce.cuf
+++ b/flang/test/Semantics/reduce.cuf
@@ -1,9 +1,9 @@
! RUN: %python %S/test_errors.py %s %flang_fc1
subroutine s(n,m,a,l)
integer, intent(in) :: n
- integer, intent(in) :: m(n)
- real, intent(in) :: a(n)
- logical, intent(in) :: l(n)
+ integer, device, intent(in) :: m(n)
+ real, device, intent(in) :: a(n)
+ logical, device, intent(in) :: l(n)
integer j, mr
real ar
logical lr
diff --git a/flang/test/Semantics/resolve33.f90 b/flang/test/Semantics/resolve33.f90
index 4b27a8d..88ebd1b 100644
--- a/flang/test/Semantics/resolve33.f90
+++ b/flang/test/Semantics/resolve33.f90
@@ -10,7 +10,7 @@
! in that derived-type-def.
module m
- !ERROR: Duplicate type parameter name: 'a'
+ !ERROR: Type parameter, component, or procedure binding 'a' already defined in this type
type t1(a, b, a)
integer, kind :: a
integer(8), len :: b
@@ -23,23 +23,31 @@ module m
!ERROR: No definition found for type parameter 'b'
type t3(a, b)
integer, kind :: a
+ !ERROR: Component 'b' is already declared in this derived type
integer :: b
end type
type t4(a)
integer, kind :: a
- !ERROR: 'd' is not a type parameter of this derived type
+ !ERROR: 'd' is not a parameter of this derived type
integer(8), len :: d
end type
type t5(a, b)
integer, len :: a
integer, len :: b
- !ERROR: Type parameter, component, or procedure binding 'a' already defined in this type
+ !ERROR: Type parameter 'a' was already declared in this derived type
integer, len :: a
end type
!ERROR: No definition found for type parameter 'k'
!ERROR: No definition found for type parameter 'l'
type :: t6(k, l)
+ !ERROR: Type parameter 'k' was referenced before being declared
+ !ERROR: Type parameter 'l' was referenced before being declared
character(kind=k, len=l) :: d3
end type
type(t6(2, 10)) :: x3
+ type :: t7(k1,k2)
+ !ERROR: Type parameter 'k2' was referenced before being declared
+ integer(kind(k2)), kind :: k1
+ integer(kind(k1)), kind :: k2
+ end type
end module
diff --git a/flang/test/Semantics/resolve53.f90 b/flang/test/Semantics/resolve53.f90
index 1b0f3f8..0ab4b7c 100644
--- a/flang/test/Semantics/resolve53.f90
+++ b/flang/test/Semantics/resolve53.f90
@@ -227,14 +227,17 @@ contains
real function f1(x, y)
real, intent(in) :: x
logical, intent(in) :: y
+ f1 = 0.
end
integer function f2(x, y)
integer, intent(in) :: x
logical, intent(in) :: y
+ f2 = 0.
end
real function f3(x, y)
real, value :: x
logical, value :: y
+ f3 = 0.
end
end module
@@ -447,12 +450,15 @@ module m19
contains
integer function f1(i)
integer, intent(in) :: i
+ f1 = 0
end
integer function f2(i, j)
integer, value :: i, j
+ f2 = 0
end
integer function f3(i, j)
integer, intent(in) :: i, j
+ f3 = 0
end
end
diff --git a/flang/test/Semantics/undef-result01.f90 b/flang/test/Semantics/undef-result01.f90
new file mode 100644
index 0000000..a372fcd
--- /dev/null
+++ b/flang/test/Semantics/undef-result01.f90
@@ -0,0 +1,144 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -Werror
+
+!WARNING: Function result is never defined
+function basic()
+end
+
+function defdByIntentOut()
+ call intentout(defdByIntentOut)
+ contains
+ subroutine intentout(x)
+ real, intent(out) :: x
+ end
+end
+
+function defdByIntentInOut()
+ call intentinout(defdByIntentInOut)
+ contains
+ subroutine intentInout(x)
+ real, intent(in out) :: x
+ end
+end
+
+function defdByIntentInPtr()
+ real, target :: defdByIntentInPtr
+ call intentInPtr(defdByIntentInPtr)
+ contains
+ subroutine intentInPtr(p)
+ real, intent(in), pointer :: p
+ end
+end
+
+!WARNING: Function result is never defined
+function notDefdByCall()
+ call intentin(notDefdByCall)
+ contains
+ subroutine intentin(n)
+ integer, intent(in) :: n
+ end
+end
+
+!WARNING: Function result is never defined
+function basicAlloc()
+ real, allocatable :: basicAlloc
+ allocate(basicAlloc)
+end
+
+function sourcedAlloc()
+ real, allocatable :: sourcedAlloc
+ allocate(sourcedAlloc, source=0.)
+end
+
+function defdByEntry()
+ entry entry1
+ entry1 = 0.
+end
+
+function defdByEntry2()
+ entry entry2() result(entryResult)
+ entryResult = 0.
+end
+
+function usedAsTarget()
+ real, target :: usedAsTarget
+ real, pointer :: p
+ p => usedAsTarget
+end
+
+function entryUsedAsTarget()
+ real, target :: entryResult
+ real, pointer :: p
+ entry entry5() result(entryResult)
+ p => entryResult
+end
+
+function defdByCall()
+ call implicitInterface(defdByCall)
+end
+
+function defdInInternal()
+ contains
+ subroutine internal
+ defdInInternal = 0.
+ end
+end
+
+function defdByEntryInInternal()
+ entry entry3() result(entryResult)
+ contains
+ subroutine internal
+ entryResult = 0.
+ end
+end
+
+type(defaultInitialized) function defdByDefault()
+ type defaultInitialized
+ integer :: n = 123
+ end type
+end
+
+integer function defdByDo()
+ do defdByDo = 1, 10
+ end do
+end
+
+function defdByRead()
+ read(*,*) defdByRead
+end function
+
+function defdByNamelist()
+ namelist /nml/ defdByNamelist
+ read(*,nml=nml)
+end
+
+character(4) function defdByWrite()
+ write(defdByWrite) 'abcd'
+end
+
+integer function defdBySize()
+ real arr(10)
+ read(*,size=defdBySize) arr
+end
+
+character(40) function defdByIomsg()
+ write(123,*,iomsg=defdByIomsg)
+end
+
+character(20) function defdByInquire()
+ inquire(6,status=defdByInquire)
+end
+
+!WARNING: Function result is never defined
+character(20) function notDefdByInquire()
+ inquire(file=notDefdByInquire)
+end
+
+integer function defdByNewunit()
+ open(newunit=defdByNewunit, file="foo.txt")
+end
+
+function defdByAssociate()
+ associate(s => defdByAssociate)
+ s = 1.
+ end associate
+end
diff --git a/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp b/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp
index becaa3c..47342da 100644
--- a/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp
+++ b/flang/unittests/Optimizer/Builder/Runtime/NumericTest.cpp
@@ -56,14 +56,6 @@ void testGenNearest(fir::FirOpBuilder &builder, mlir::Type xType,
mlir::Value s = builder.create<fir::UndefOp>(loc, sType);
mlir::Value nearest = fir::runtime::genNearest(builder, loc, x, s);
checkCallOp(nearest.getDefiningOp(), fctName, 2, /*addLocArg=*/false);
- auto callOp = mlir::dyn_cast<fir::CallOp>(nearest.getDefiningOp());
- mlir::Value select = callOp.getOperands()[1];
- EXPECT_TRUE(mlir::isa<mlir::arith::SelectOp>(select.getDefiningOp()));
- auto selectOp = mlir::dyn_cast<mlir::arith::SelectOp>(select.getDefiningOp());
- mlir::Value cmp = selectOp.getCondition();
- EXPECT_TRUE(mlir::isa<mlir::arith::CmpFOp>(cmp.getDefiningOp()));
- auto cmpOp = mlir::dyn_cast<mlir::arith::CmpFOp>(cmp.getDefiningOp());
- EXPECT_EQ(s, cmpOp.getLhs());
}
TEST_F(RuntimeCallTest, genNearestTest) {
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 1ce3bdf..dd45d6c 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -73,7 +73,7 @@ if(LIBC_BUILD_GPU_LOADER OR (LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD))
add_subdirectory(utils/gpu)
endif()
-option(LIBC_USE_NEW_HEADER_GEN "Generate header files using new headergen instead of the old one" OFF)
+option(LIBC_USE_NEW_HEADER_GEN "Generate header files using new headergen instead of the old one" ON)
set(NEED_LIBC_HDRGEN FALSE)
if(NOT LLVM_RUNTIMES_BUILD)
diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
index 69518ac..5fa3e44 100644
--- a/libc/benchmarks/gpu/CMakeLists.txt
+++ b/libc/benchmarks/gpu/CMakeLists.txt
@@ -22,8 +22,12 @@ function(add_benchmark benchmark_name)
${BENCHMARK_LINK_LIBRARIES}
DEPENDS
libc.src.stdio.printf
+ libc.src.stdlib.srand
+ libc.src.stdlib.rand
${BENCHMARK_DEPENDS}
${BENCHMARK_UNPARSED_ARGUMENTS}
+ COMPILE_OPTIONS
+ -flto
)
get_fq_target_name(${benchmark_name} fq_target_name)
set(fq_build_target_name ${fq_target_name}.__build__)
@@ -50,13 +54,17 @@ add_unittest_framework_library(
libc.src.__support.CPP.limits
libc.src.__support.CPP.algorithm
libc.src.__support.CPP.atomic
+ libc.src.__support.CPP.array
libc.src.__support.fixed_point.fx_rep
libc.src.__support.macros.properties.types
libc.src.__support.OSUtil.osutil
libc.src.__support.uint128
+ libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.sqrt
libc.src.__support.fixedvector
libc.src.time.clock
+ libc.src.stdlib.rand
+ libc.src.stdlib.srand
libc.benchmarks.gpu.timing.timing
libc.src.stdio.printf
)
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
index 031ad16..a5dbc62 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -8,6 +8,7 @@
#include "src/__support/fixedvector.h"
#include "src/__support/macros/config.h"
#include "src/stdio/printf.h"
+#include "src/stdlib/srand.h"
#include "src/time/gpu/time_utils.h"
namespace LIBC_NAMESPACE_DECL {
@@ -136,8 +137,10 @@ void print_header() {
void Benchmark::run_benchmarks() {
uint64_t id = gpu::get_thread_id();
- if (id == 0)
+ if (id == 0) {
print_header();
+ LIBC_NAMESPACE::srand(gpu::processor_clock());
+ }
gpu::sync_threads();
@@ -205,6 +208,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options,
if (samples >= options.max_samples || iterations >= options.max_iterations)
break;
if (total_time >= options.min_duration && samples >= options.min_samples &&
+ total_iterations >= options.min_iterations &&
change_ratio < options.epsilon)
break;
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
index f5cf482..2b85b14 100644
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -3,10 +3,14 @@
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
+#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
+#include "src/stdlib/rand.h"
#include "src/time/clock.h"
#include <stdint.h>
@@ -17,12 +21,13 @@ namespace benchmarks {
struct BenchmarkOptions {
uint32_t initial_iterations = 1;
+ uint32_t min_iterations = 50;
uint32_t max_iterations = 10000000;
uint32_t min_samples = 4;
uint32_t max_samples = 1000;
- int64_t min_duration = 0; // in nanoseconds (ns)
+ int64_t min_duration = 500 * 1000; // 500 * 1000 nanoseconds = 500 us
int64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second
- double epsilon = 0.01;
+ double epsilon = 0.0001;
double scaling_factor = 1.4;
};
@@ -104,6 +109,54 @@ private:
return benchmark(options, func);
}
};
+
+// We want our random values to be approximately
+// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) <
+// 2^(max_exponent + 1)
+template <typename T> static T get_rand_input() {
+ using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+
+ // Required to correctly instantiate FPBits for floats and doubles.
+ using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
+ uint64_t, uint32_t>;
+ RandType bits;
+ if constexpr (cpp::is_same_v<RandType, uint64_t>)
+ bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
+ static_cast<uint64_t>(LIBC_NAMESPACE::rand());
+ else
+ bits = LIBC_NAMESPACE::rand();
+ double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits<T>::FRACTION_LEN / 2048.0;
+ FPBits fp(bits);
+ fp.set_biased_exponent(
+ static_cast<uint32_t>(fp.get_biased_exponent() * scale));
+ return fp.get_val();
+}
+
+template <typename T> class MathPerf {
+ using FPBits = fputil::FPBits<T>;
+ using StorageType = typename FPBits::StorageType;
+ static constexpr StorageType UIntMax =
+ cpp::numeric_limits<StorageType>::max();
+
+public:
+ typedef T Func(T);
+
+ static uint64_t run_perf_in_range(Func f, StorageType starting_bit,
+ StorageType ending_bit, StorageType step) {
+ uint64_t total_time = 0;
+ if (step <= 0)
+ step = 1;
+ volatile T result;
+ for (StorageType bits = starting_bit; bits < ending_bit; bits += step) {
+ T x = FPBits(bits).get_val();
+ total_time += LIBC_NAMESPACE::latency(f, x);
+ }
+ StorageType num_runs = (ending_bit - starting_bit) / step + 1;
+
+ return total_time / num_runs;
+ }
+};
+
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt
index 42eb4f7..f15d082 100644
--- a/libc/benchmarks/gpu/src/CMakeLists.txt
+++ b/libc/benchmarks/gpu/src/CMakeLists.txt
@@ -1 +1,2 @@
add_subdirectory(ctype)
+add_subdirectory(math)
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
new file mode 100644
index 0000000..77250ed
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -0,0 +1,44 @@
+add_custom_target(libc-gpu-math-benchmarks)
+
+set(math_benchmark_flags "")
+if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
+ if(CUDAToolkit_FOUND)
+ set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc)
+ if (EXISTS ${libdevice_path})
+ list(APPEND math_benchmark_flags
+ "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}")
+ # Compile definition needed so the benchmark knows to register
+ # NVPTX benchmarks.
+ list(APPEND math_benchmark_flags "-DNVPTX_MATH_FOUND=1")
+ endif()
+ endif()
+endif()
+
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+ find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
+ if(AMDDeviceLibs_FOUND)
+ get_target_property(ocml_path ocml IMPORTED_LOCATION)
+ list(APPEND math_benchmark_flags
+ "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}")
+ list(APPEND math_benchmark_flags "-DAMDGPU_MATH_FOUND=1")
+ endif()
+endif()
+
+add_benchmark(
+ sin_benchmark
+ SUITE
+ libc-gpu-math-benchmarks
+ SRCS
+ sin_benchmark.cpp
+ DEPENDS
+ libc.src.math.sin
+ libc.src.stdlib.srand
+ libc.src.stdlib.rand
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.CPP.bit
+ libc.src.__support.CPP.array
+ COMPILE_OPTIONS
+ ${math_benchmark_flags}
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
new file mode 100644
index 0000000..5849ea3
--- /dev/null
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@@ -0,0 +1,65 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/functional.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/sin.h"
+#include "src/stdlib/rand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+#include "src/math/amdgpu/declarations.h"
+#endif
+
+constexpr double M_PI = 3.14159265358979323846;
+uint64_t get_bits(double x) {
+ return LIBC_NAMESPACE::cpp::bit_cast<uint64_t>(x);
+}
+
+// BENCHMARK() expects a function with no parameters that returns a
+// uint64_t representing the latency. Defining each benchmark using a macro
+// that expands to a lambda allows us to easily switch the implementation of
+// `sin()` when registering NVPTX benchmarks.
+#define BM_RANDOM_INPUT(Func) \
+ []() { \
+ double x = LIBC_NAMESPACE::benchmarks::get_rand_input<double>(); \
+ return LIBC_NAMESPACE::latency(Func, x); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin));
+
+#define BM_TWO_PI(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin));
+
+#define BM_LARGE_INT(Func) \
+ []() { \
+ return LIBC_NAMESPACE::benchmarks::MathPerf<double>::run_perf_in_range( \
+ Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \
+ }
+BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::sin));
+
+#ifdef NVPTX_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin,
+ BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi,
+ BM_TWO_PI(LIBC_NAMESPACE::__nv_sin));
+BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin));
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin,
+ BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi,
+ BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64));
+BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt,
+ BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64));
+#endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
index 9b40f92..e308d61 100644
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -34,7 +34,7 @@ namespace LIBC_NAMESPACE_DECL {
gpu::memory_fence();
uint64_t start = gpu::processor_clock();
uint32_t result = 0.0;
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
asm("" ::"s"(start));
uint64_t stop = gpu::processor_clock();
return stop - start;
@@ -67,7 +67,8 @@ template <typename F, typename T>
// This inline assembly performs a no-op which forces the result to both
// be used and prevents us from exiting this region before it's complete.
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
// Obtain the current timestamp after running the calculation and force
// ordering.
@@ -98,7 +99,8 @@ template <typename F, typename T1, typename T2>
auto result = f(arg1, arg2);
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
+ asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(
+ static_cast<uint32_t>(result)));
uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
index d141b08..b426dfd 100644
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -27,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL {
uint64_t start = gpu::processor_clock();
asm("" ::"r"(y), "llr"(start));
uint32_t result = y;
- asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+ asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
volatile auto storage = result;
return stop - start;
@@ -57,7 +57,7 @@ template <typename F, typename T>
// This inline assembly performs a no-op which forces the result to both be
// used and prevents us from exiting this region before it's complete.
- asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+ asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
// Obtain the current timestamp after running the calculation and force
// ordering.
@@ -85,7 +85,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
auto result = f(arg, arg2);
- asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :);
+ asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
uint64_t stop = gpu::processor_clock();
gpu::memory_fence();
diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
index e677b4c..0b1878d 100644
--- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake
+++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake
@@ -111,19 +111,9 @@ function(add_bitcode_entrypoint_library target_name base_target_name)
list(APPEND objects ${object})
endforeach()
- set(output ${CMAKE_CURRENT_BINARY_DIR}/${target_name}.bc)
- add_custom_command(
- OUTPUT ${output}
- COMMAND ${LIBC_LLVM_LINK} ${objects} -o ${output}
- DEPENDS ${all_deps} ${base_target_name}
- COMMENT "Linking LLVM-IR bitcode for ${base_target_name}"
- COMMAND_EXPAND_LISTS
- )
- add_custom_target(${target_name} DEPENDS ${output} ${all_deps})
- set_target_properties(${target_name} PROPERTIES TARGET_OBJECT ${output})
- if(TARGET llvm-link)
- add_dependencies(${target_name} llvm-link)
- endif()
+ add_executable(${target_name} ${objects})
+ target_link_options(${target_name} PRIVATE
+ "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm")
endfunction(add_bitcode_entrypoint_library)
# A rule to build a library from a collection of entrypoint objects.
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 96eb065..a8b0c61d 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -474,6 +474,8 @@ function(add_integration_test test_name)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
"-Wl,--suppress-stack-size-warning"
+ "-Wl,-mllvm,-nvptx-lower-global-ctor-dtor=1"
+ "-Wl,-mllvm,-nvptx-emit-init-fini-kernel"
-march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
"--cuda-path=${LIBC_CUDA_ROOT}")
elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
@@ -657,6 +659,8 @@ function(add_libc_hermetic test_name)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
"-Wl,--suppress-stack-size-warning"
+ "-Wl,-mllvm,-nvptx-lower-global-ctor-dtor=1"
+ "-Wl,-mllvm,-nvptx-emit-init-fini-kernel"
-march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
"--cuda-path=${LIBC_CUDA_ROOT}")
elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP)
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index e2a0908..f3537f2 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -21,36 +21,10 @@ if(LIBC_TARGET_TRIPLE)
set(CMAKE_REQUIRED_FLAGS "--target=${LIBC_TARGET_TRIPLE}")
endif()
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
- set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib")
+ set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nogpulib -nostdlib")
elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
set(CMAKE_REQUIRED_FLAGS
- "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument")
-endif()
-
-# Identify the program used to package multiple images into a single binary.
-get_filename_component(compiler_path ${CMAKE_CXX_COMPILER} DIRECTORY)
-if(TARGET clang-offload-packager)
- get_target_property(LIBC_CLANG_OFFLOAD_PACKAGER clang-offload-packager LOCATION)
-else()
- find_program(LIBC_CLANG_OFFLOAD_PACKAGER
- NAMES clang-offload-packager NO_DEFAULT_PATH
- PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
- if(NOT LIBC_CLANG_OFFLOAD_PACKAGER)
- message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU "
- "build")
- endif()
-endif()
-
-# Identify llvm-link program so we can merge the output IR into a single blob.
-if(TARGET llvm-link)
- get_target_property(LIBC_LLVM_LINK llvm-link LOCATION)
-else()
- find_program(LIBC_LLVM_LINK
- NAMES llvm-link NO_DEFAULT_PATH
- PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
- if(NOT LIBC_LLVM_LINK)
- message(FATAL_ERROR "Cannot find 'llvm-link' for the GPU build")
- endif()
+ "${CMAKE_REQUIRED_FLAGS} -flto -c -Wno-unused-command-line-argument -nostdlib")
endif()
# Optionally set up a job pool to limit the number of GPU tests run in parallel.
diff --git a/libc/config/config.json b/libc/config/config.json
index 3532925..538fea5 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -71,16 +71,6 @@
"doc": "Default number of spins before blocking if a rwlock is in contention (default to 100)."
}
},
- "unistd": {
- "LIBC_CONF_ENABLE_TID_CACHE": {
- "value": true,
- "doc": "Enable caching mechanism for gettid to avoid syscall (only effective in fullbuild mode, default to true). Please refer to Undefined Behavior documentation for implications."
- },
- "LIBC_CONF_ENABLE_PID_CACHE": {
- "value": true,
- "doc": "Enable caching mechanism for getpid to avoid syscall (default to true). Please refer to Undefined Behavior documentation for implications."
- }
- },
"math": {
"LIBC_CONF_MATH_OPTIMIZATIONS": {
"value": 0,
@@ -92,5 +82,11 @@
"value": "LIBC_QSORT_QUICK_SORT",
"doc": "Configures sorting algorithm for qsort and qsort_r. Values accepted are LIBC_QSORT_QUICK_SORT, LIBC_QSORT_HEAP_SORT."
}
+ },
+ "setjmp": {
+ "LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER": {
+ "value": true,
+ "doc": "Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved."
+ }
}
}
diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index 38eace2..13280d2 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -136,7 +136,9 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.cos
libc.src.math.cosf
libc.src.math.cospif
+ libc.src.math.dfmal
libc.src.math.dsqrtl
+ libc.src.math.dsubl
libc.src.math.erff
libc.src.math.exp
libc.src.math.expf
diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt
index df9f9bf..1cff157 100644
--- a/libc/config/darwin/x86_64/entrypoints.txt
+++ b/libc/config/darwin/x86_64/entrypoints.txt
@@ -119,7 +119,9 @@ set(TARGET_LIBM_ENTRYPOINTS
#libc.src.math.ceill
#libc.src.math.coshf
#libc.src.math.cosf
+ #libc.src.math.dfmal
#libc.src.math.dsqrtl
+ #libc.src.math.dsubl
#libc.src.math.expf
#libc.src.math.exp2f
#libc.src.math.expm1f
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index df7aa9e..bddb1c3 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -166,8 +166,9 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdlib.strtoul
libc.src.stdlib.strtoull
- # Only implemented in the test suite
+ # TODO: Implement these correctly
libc.src.stdlib.aligned_alloc
+ libc.src.stdlib.calloc
libc.src.stdlib.free
libc.src.stdlib.malloc
libc.src.stdlib.realloc
@@ -187,6 +188,7 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdio.vsnprintf
libc.src.stdio.vsprintf
libc.src.stdio.sscanf
+ libc.src.stdio.vsscanf
libc.src.stdio.feof
libc.src.stdio.ferror
libc.src.stdio.fflush
@@ -226,7 +228,6 @@ set(TARGET_LIBC_ENTRYPOINTS
# gpu/rpc.h entrypoints
libc.src.gpu.rpc_host_call
- libc.src.gpu.rpc_fprintf
)
set(TARGET_LIBM_ENTRYPOINTS
@@ -282,6 +283,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fmodf
libc.src.math.frexp
libc.src.math.frexpf
+ libc.src.math.getpayload
+ libc.src.math.getpayloadf
libc.src.math.hypot
libc.src.math.hypotf
libc.src.math.ilogb
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index b2c5341..1cb357f 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -297,7 +297,6 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.unistd.geteuid
libc.src.unistd.getpid
libc.src.unistd.getppid
- libc.src.unistd.gettid
libc.src.unistd.getuid
libc.src.unistd.isatty
libc.src.unistd.link
@@ -347,6 +346,9 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.atan2f
libc.src.math.atanf
libc.src.math.atanhf
+ libc.src.math.canonicalize
+ libc.src.math.canonicalizef
+ libc.src.math.canonicalizel
libc.src.math.cbrt
libc.src.math.cbrtf
libc.src.math.ceil
@@ -359,14 +361,17 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.cosf
libc.src.math.coshf
libc.src.math.cospif
+ libc.src.math.dfmal
libc.src.math.dmull
libc.src.math.dsqrtl
+ libc.src.math.dsubl
libc.src.math.erff
libc.src.math.exp
libc.src.math.exp10
libc.src.math.exp10f
libc.src.math.exp2
libc.src.math.exp2f
+ libc.src.math.exp2m1f
libc.src.math.expf
libc.src.math.expm1
libc.src.math.expm1f
@@ -426,6 +431,10 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fromfpx
libc.src.math.fromfpxf
libc.src.math.fromfpxl
+ libc.src.math.fsqrt
+ libc.src.math.fsqrtl
+ libc.src.math.getpayload
+ libc.src.math.getpayloadf
libc.src.math.hypot
libc.src.math.hypotf
libc.src.math.ilogb
@@ -495,14 +504,16 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.rintf
libc.src.math.rintl
libc.src.math.round
- libc.src.math.roundf
- libc.src.math.roundl
libc.src.math.roundeven
libc.src.math.roundevenf
libc.src.math.roundevenl
+ libc.src.math.roundf
+ libc.src.math.roundl
libc.src.math.scalbn
libc.src.math.scalbnf
libc.src.math.scalbnl
+ libc.src.math.setpayload
+ libc.src.math.setpayloadf
libc.src.math.sin
libc.src.math.sincos
libc.src.math.sincosf
@@ -515,6 +526,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.tan
libc.src.math.tanf
libc.src.math.tanhf
+ libc.src.math.totalorder
+ libc.src.math.totalorderf
libc.src.math.totalordermag
libc.src.math.totalordermagf
libc.src.math.totalordermagl
@@ -603,7 +616,9 @@ if(LIBC_TYPES_HAS_FLOAT128)
# math.h C23 _Float128 entrypoints
libc.src.math.ceilf128
libc.src.math.copysignf128
+ libc.src.math.dfmaf128
libc.src.math.dsqrtf128
+ libc.src.math.dsubf128
libc.src.math.fabsf128
libc.src.math.fdimf128
libc.src.math.floorf128
@@ -621,6 +636,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.frexpf128
libc.src.math.fromfpf128
libc.src.math.fromfpxf128
+ libc.src.math.getpayloadf128
libc.src.math.ilogbf128
libc.src.math.ldexpf128
libc.src.math.llogbf128
@@ -640,7 +656,9 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.roundf128
libc.src.math.roundevenf128
libc.src.math.scalbnf128
+ libc.src.math.setpayloadf128
libc.src.math.sqrtf128
+ libc.src.math.totalorderf128
libc.src.math.totalordermagf128
libc.src.math.truncf128
libc.src.math.ufromfpf128
@@ -693,6 +711,8 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.pthread.pthread_mutexattr_setrobust
libc.src.pthread.pthread_mutexattr_settype
libc.src.pthread.pthread_once
+ libc.src.pthread.pthread_rwlock_clockrdlock
+ libc.src.pthread.pthread_rwlock_clockwrlock
libc.src.pthread.pthread_rwlock_destroy
libc.src.pthread.pthread_rwlock_init
libc.src.pthread.pthread_rwlock_rdlock
@@ -715,6 +735,10 @@ if(LLVM_LIBC_FULL_BUILD)
# sched.h entrypoints
libc.src.sched.__sched_getcpucount
+ # setjmp.h entrypoints
+ libc.src.setjmp.longjmp
+ libc.src.setjmp.setjmp
+
# stdio.h entrypoints
libc.src.stdio.clearerr
libc.src.stdio.clearerr_unlocked
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 8e77105..90aae96 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -228,6 +228,7 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.cos
libc.src.math.cosf
libc.src.math.coshf
+ libc.src.math.dfmal
libc.src.math.dsqrtl
libc.src.math.erff
libc.src.math.exp
@@ -292,6 +293,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fromfpx
libc.src.math.fromfpxf
libc.src.math.fromfpxl
+ libc.src.math.getpayload
+ libc.src.math.getpayloadf
libc.src.math.hypot
libc.src.math.hypotf
libc.src.math.ilogb
@@ -363,6 +366,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.scalbn
libc.src.math.scalbnf
libc.src.math.scalbnl
+ libc.src.math.setpayload
+ libc.src.math.setpayloadf
libc.src.math.sin
libc.src.math.sincos
libc.src.math.sincosf
@@ -374,6 +379,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.tan
libc.src.math.tanf
libc.src.math.tanhf
+ libc.src.math.totalorder
+ libc.src.math.totalorderf
libc.src.math.totalordermag
libc.src.math.totalordermagf
libc.src.math.totalordermagl
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index e3ed5a5..60b5654c 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -217,6 +217,7 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdio.snprintf
libc.src.stdio.sprintf
libc.src.stdio.sscanf
+ libc.src.stdio.vsscanf
libc.src.stdio.vfprintf
libc.src.stdio.vprintf
libc.src.stdio.vsnprintf
@@ -315,7 +316,6 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.unistd.geteuid
libc.src.unistd.getpid
libc.src.unistd.getppid
- libc.src.unistd.gettid
libc.src.unistd.getuid
libc.src.unistd.isatty
libc.src.unistd.link
@@ -384,8 +384,10 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.cosf
libc.src.math.coshf
libc.src.math.cospif
+ libc.src.math.dfmal
libc.src.math.dmull
libc.src.math.dsqrtl
+ libc.src.math.dsubl
libc.src.math.erff
libc.src.math.exp
libc.src.math.exp10
@@ -454,6 +456,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fromfpxl
libc.src.math.fsqrt
libc.src.math.fsqrtl
+ libc.src.math.getpayload
+ libc.src.math.getpayloadf
libc.src.math.hypot
libc.src.math.hypotf
libc.src.math.ilogb
@@ -531,6 +535,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.scalbn
libc.src.math.scalbnf
libc.src.math.scalbnl
+ libc.src.math.setpayload
+ libc.src.math.setpayloadf
libc.src.math.sin
libc.src.math.sincos
libc.src.math.sincosf
@@ -543,6 +549,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.tan
libc.src.math.tanf
libc.src.math.tanhf
+ libc.src.math.totalorder
+ libc.src.math.totalorderf
libc.src.math.totalordermag
libc.src.math.totalordermagf
libc.src.math.totalordermagl
@@ -584,6 +592,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.fromfpf128
libc.src.math.fromfpxf128
libc.src.math.fsqrtf128
+ libc.src.math.getpayloadf128
libc.src.math.ilogbf128
libc.src.math.ldexpf128
libc.src.math.llogbf128
@@ -604,6 +613,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.roundf128
libc.src.math.scalbnf128
libc.src.math.sqrtf128
+ libc.src.math.totalorderf128
libc.src.math.totalordermagf128
libc.src.math.truncf128
libc.src.math.ufromfpf128
@@ -704,6 +714,8 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.pthread.pthread_mutexattr_setrobust
libc.src.pthread.pthread_mutexattr_settype
libc.src.pthread.pthread_once
+ libc.src.pthread.pthread_rwlock_clockrdlock
+ libc.src.pthread.pthread_rwlock_clockwrlock
libc.src.pthread.pthread_rwlock_destroy
libc.src.pthread.pthread_rwlock_init
libc.src.pthread.pthread_rwlock_rdlock
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 96f9755..a577bfa 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -217,6 +217,7 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.stdio.snprintf
libc.src.stdio.sprintf
libc.src.stdio.sscanf
+ libc.src.stdio.vsscanf
libc.src.stdio.vfprintf
libc.src.stdio.vprintf
libc.src.stdio.vsnprintf
@@ -315,7 +316,6 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.unistd.geteuid
libc.src.unistd.getpid
libc.src.unistd.getppid
- libc.src.unistd.gettid
libc.src.unistd.getuid
libc.src.unistd.isatty
libc.src.unistd.link
@@ -384,8 +384,10 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.cosf
libc.src.math.coshf
libc.src.math.cospif
+ libc.src.math.dfmal
libc.src.math.dmull
libc.src.math.dsqrtl
+ libc.src.math.dsubl
libc.src.math.erff
libc.src.math.exp
libc.src.math.exp10
@@ -454,6 +456,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.fromfpxl
libc.src.math.fsqrt
libc.src.math.fsqrtl
+ libc.src.math.getpayload
+ libc.src.math.getpayloadf
libc.src.math.hypot
libc.src.math.hypotf
libc.src.math.ilogb
@@ -531,6 +535,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.scalbn
libc.src.math.scalbnf
libc.src.math.scalbnl
+ libc.src.math.setpayload
+ libc.src.math.setpayloadf
libc.src.math.sin
libc.src.math.sincos
libc.src.math.sincosf
@@ -543,6 +549,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.tan
libc.src.math.tanf
libc.src.math.tanhf
+ libc.src.math.totalorder
+ libc.src.math.totalorderf
libc.src.math.totalordermag
libc.src.math.totalordermagf
libc.src.math.totalordermagl
@@ -563,6 +571,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
libc.src.math.canonicalizef16
libc.src.math.ceilf16
libc.src.math.copysignf16
+ libc.src.math.expf16
libc.src.math.f16add
libc.src.math.f16addf
libc.src.math.f16addl
@@ -649,8 +658,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.canonicalizef128
libc.src.math.ceilf128
libc.src.math.copysignf128
+ libc.src.math.dfmaf128
libc.src.math.dmulf128
libc.src.math.dsqrtf128
+ libc.src.math.dsubf128
libc.src.math.fabsf128
libc.src.math.fdimf128
libc.src.math.floorf128
@@ -670,6 +681,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.fromfpf128
libc.src.math.fromfpxf128
libc.src.math.fsqrtf128
+ libc.src.math.getpayloadf128
libc.src.math.ilogbf128
libc.src.math.ldexpf128
libc.src.math.llogbf128
@@ -689,7 +701,9 @@ if(LIBC_TYPES_HAS_FLOAT128)
libc.src.math.roundevenf128
libc.src.math.roundf128
libc.src.math.scalbnf128
+ libc.src.math.setpayloadf128
libc.src.math.sqrtf128
+ libc.src.math.totalorderf128
libc.src.math.totalordermagf128
libc.src.math.truncf128
libc.src.math.ufromfpf128
@@ -791,6 +805,8 @@ if(LLVM_LIBC_FULL_BUILD)
libc.src.pthread.pthread_mutexattr_setrobust
libc.src.pthread.pthread_mutexattr_settype
libc.src.pthread.pthread_once
+ libc.src.pthread.pthread_rwlock_clockrdlock
+ libc.src.pthread.pthread_rwlock_clockwrlock
libc.src.pthread.pthread_rwlock_destroy
libc.src.pthread.pthread_rwlock_init
libc.src.pthread.pthread_rwlock_rdlock
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 06c3682..e45219a 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -133,6 +133,8 @@ set(TARGET_LIBM_ENTRYPOINTS
libc.src.math.cos
libc.src.math.cosf
libc.src.math.coshf
+ libc.src.math.dfmal
+ libc.src.math.dsubl
libc.src.math.erff
libc.src.math.exp
libc.src.math.expf
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 1936c87..950de0e 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -47,9 +47,8 @@ to learn about the defaults for your platform and target.
* **"scanf" options**
- ``LIBC_CONF_SCANF_DISABLE_FLOAT``: Disable parsing floating point values in scanf and friends.
- ``LIBC_CONF_SCANF_DISABLE_INDEX_MODE``: Disable index mode in the scanf format string.
+* **"setjmp" options**
+ - ``LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER``: Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved.
* **"string" options**
- ``LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING``: Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled.
- ``LIBC_CONF_STRING_UNSAFE_WIDE_READ``: Read more than a byte at a time to perform byte-string operations like strlen.
-* **"unistd" options**
- - ``LIBC_CONF_ENABLE_PID_CACHE``: Enable caching mechanism for getpid to avoid syscall (default to true). Please refer to Undefined Behavior documentation for implications.
- - ``LIBC_CONF_ENABLE_TID_CACHE``: Enable caching mechanism for gettid to avoid syscall (only effective in fullbuild mode, default to true). Please refer to Undefined Behavior documentation for implications.
diff --git a/libc/docs/dev/header_generation.rst b/libc/docs/dev/header_generation.rst
index 32ce91d..598c8b8 100644
--- a/libc/docs/dev/header_generation.rst
+++ b/libc/docs/dev/header_generation.rst
@@ -22,7 +22,7 @@ Instructions
------------
Required Versions:
- - Python Version: 3.6
+ - Python Version: 3.8
- PyYAML Version: 5.1
1. Keep full-build mode on when building, otherwise headers will not be
diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst
index b712780..9f50545 100644
--- a/libc/docs/dev/undefined_behavior.rst
+++ b/libc/docs/dev/undefined_behavior.rst
@@ -94,25 +94,7 @@ Non-const Constant Return Values
--------------------------------
Some libc functions, like ``dlerror()``, return ``char *`` instead of ``const char *`` and then tell the caller they promise not to modify this value. Any modification of this value is undefined behavior.
-Cached ``getpid/gettid``
-------------------------
-Since version ``2.25``, glibc removes its cache mechanism for ``getpid/gettid``
-(See the history section in https://man7.org/linux/man-pages/man2/getpid.2.html).
-LLVM's libc still implements the cache as it is useful for fast deadlock detection.
-The cache mechanism is also implemented in MUSL and bionic. The tid/pid cache can
-be disabled by setting ``LIBC_CONF_ENABLE_TID_CACHE`` and ``LIBC_CONF_ENABLE_PID_CACHE``
-to ``false`` respectively.
-
-Unwrapped ``SYS_clone/SYS_fork/SYS_vfork``
-------------------------------------------
-It is highly discouraged to use unwrapped ``SYS_clone/SYS_fork/SYS_vfork``.
-First, calling such syscalls without provided libc wrappers ignores
-all the ``pthread_atfork`` entries as libc can no longer detect the ``fork``.
-Second, libc relies on the ``fork/clone`` wrappers to correctly maintain cache for
-process id and thread id, and other important process-specific states such as the list
-of robust mutexes. Third, even if the user is to call ``exec*`` functions immediately,
-there can still be other unexpected issues. For instance, there can be signal handlers
-inherited from parent process triggered inside the instruction window between ``fork``
-and ``exec*``. As libc failed to maintain its internal states correctly, even though the
-functions used inside the signal handlers are marked as ``async-signal-safe`` (such as
-``getpid``), they will still return wrong values or lead to other even worse situations.
+Unrecognized ``clockid_t`` values for ``pthread_rwlock_clock*`` APIs
+----------------------------------------------------------------------
+POSIX.1-2024 only demands support for ``CLOCK_REALTIME`` and ``CLOCK_MONOTONIC``. Currently, in
+LLVM libc, if other clock ids are used, they will be treated as monotonic clocks.
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 9f88b4d..bbe5b19 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -118,11 +118,11 @@ Basic Operations
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| ddiv | N/A | N/A | | N/A | | 7.12.14.4 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dfma | N/A | N/A | | N/A | | 7.12.14.5 | F.10.11 |
+| dfma | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.5 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| dmul | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.3 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| dsub | N/A | N/A | | N/A | | 7.12.14.2 | F.10.11 |
+| dsub | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.2 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| f16add | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.1 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -178,7 +178,7 @@ Basic Operations
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| fsub | N/A | | | N/A | | 7.12.14.2 | F.10.11 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| getpayload | | | | |check| | | F.10.13.1 | N/A |
+| getpayload | |check| | |check| | | |check| | |check| | F.10.13.1 | N/A |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| ilogb | |check| | |check| | |check| | |check| | |check| | 7.12.6.8 | F.10.3.8 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -224,11 +224,11 @@ Basic Operations
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| scalbn | |check| | |check| | |check| | |check| | |check| | 7.12.6.19 | F.10.3.19 |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| setpayload | | | | |check| | | F.10.13.2 | N/A |
+| setpayload | |check| | |check| | | |check| | |check| | F.10.13.2 | N/A |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| setpayloadsig | | | | |check| | | F.10.13.3 | N/A |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| totalorder | | | | |check| | | F.10.12.1 | N/A |
+| totalorder | |check| | |check| | | |check| | |check| | F.10.12.1 | N/A |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| totalordermag | |check| | |check| | |check| | |check| | |check| | F.10.12.2 | N/A |
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
@@ -284,7 +284,7 @@ Higher Math Functions
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| erfc | | | | | | 7.12.8.2 | F.10.5.2 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| exp | |check| | |check| | | | | 7.12.6.1 | F.10.3.1 |
+| exp | |check| | |check| | | |check| | | 7.12.6.1 | F.10.3.1 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
| exp10 | |check| | |check| | | | | 7.12.6.2 | F.10.3.2 |
+-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/fuzzing/math/RemQuoDiff.h b/libc/fuzzing/math/RemQuoDiff.h
index 84a6a24..cfb9b7f 100644
--- a/libc/fuzzing/math/RemQuoDiff.h
+++ b/libc/fuzzing/math/RemQuoDiff.h
@@ -31,21 +31,22 @@ void RemQuoDiff(RemQuoFunc<T> func1, RemQuoFunc<T> func2, const uint8_t *data,
T remainder1 = func1(x, y, &q1);
T remainder2 = func2(x, y, &q2);
- if (isnan(remainder1)) {
- if (!isnan(remainder2))
+ LIBC_NAMESPACE::fputil::FPBits<T> bits1(remainder1);
+ LIBC_NAMESPACE::fputil::FPBits<T> bits2(remainder2);
+
+ if (bits1.is_nan()) {
+ if (!bits2.is_nan())
__builtin_trap();
return;
}
- if (isinf(remainder2) != isinf(remainder1))
+ if (bits1.is_inf() != bits2.is_inf())
__builtin_trap();
// Compare only the 3 LS bits of the quotient.
if ((q1 & 0x7) != (q2 & 0x7))
__builtin_trap();
- LIBC_NAMESPACE::fputil::FPBits<T> bits1(remainder1);
- LIBC_NAMESPACE::fputil::FPBits<T> bits2(remainder2);
if (bits1.uintval() != bits2.uintval())
__builtin_trap();
}
diff --git a/libc/fuzzing/stdlib/CMakeLists.txt b/libc/fuzzing/stdlib/CMakeLists.txt
index 204bc61..9b3298c 100644
--- a/libc/fuzzing/stdlib/CMakeLists.txt
+++ b/libc/fuzzing/stdlib/CMakeLists.txt
@@ -7,6 +7,14 @@ add_libc_fuzzer(
)
add_libc_fuzzer(
+ heap_sort_fuzz
+ SRCS
+ heap_sort_fuzz.cpp
+ DEPENDS
+ libc.src.stdlib.qsort_util
+)
+
+add_libc_fuzzer(
atof_differential_fuzz
SRCS
atof_differential_fuzz.cpp
diff --git a/libc/fuzzing/stdlib/heap_sort_fuzz.cpp b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp
new file mode 100644
index 0000000..ad2d311
--- /dev/null
+++ b/libc/fuzzing/stdlib/heap_sort_fuzz.cpp
@@ -0,0 +1,49 @@
+//===-- heap_sort_fuzz.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc heap_sort implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/macros/config.h"
+#include "src/stdlib/heap_sort.h"
+#include <stdint.h>
+
+static int int_compare(const void *l, const void *r) {
+ int li = *reinterpret_cast<const int *>(l);
+ int ri = *reinterpret_cast<const int *>(r);
+ if (li == ri)
+ return 0;
+ if (li > ri)
+ return 1;
+ return -1;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+
+ const size_t array_size = size / sizeof(int);
+ if (array_size == 0)
+ return 0;
+
+ int *array = new int[array_size];
+ const int *data_as_int = reinterpret_cast<const int *>(data);
+ for (size_t i = 0; i < array_size; ++i)
+ array[i] = data_as_int[i];
+
+ auto arr = LIBC_NAMESPACE::internal::Array(
+ reinterpret_cast<uint8_t *>(array), array_size, sizeof(int), int_compare);
+
+ LIBC_NAMESPACE::internal::heap_sort(arr);
+
+ for (size_t i = 0; i < array_size - 1; ++i)
+ if (array[i] > array[i + 1])
+ __builtin_trap();
+
+ delete[] array;
+ return 0;
+}
diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
index c158162..503b559 100644
--- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
+++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
@@ -118,9 +118,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
__builtin_trap();
// If any result is NaN, all of them should be NaN. We can't use the usual
// comparisons because NaN != NaN.
- if (isnan(float_result) ^ isnan(strtof_result))
+ if (FPBits<float>(float_result).is_nan() !=
+ FPBits<float>(strtof_result).is_nan())
__builtin_trap();
- if (!isnan(float_result) && float_result != strtof_result)
+ if (!FPBits<float>(float_result).is_nan() && float_result != strtof_result)
__builtin_trap();
mpfr_clear(mpfr_float);
}
@@ -136,10 +137,12 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
ptrdiff_t strtod_strlen = out_ptr - str_ptr;
if (result_strlen != strtod_strlen)
__builtin_trap();
- if (isnan(double_result) ^ isnan(strtod_result) ||
- isnan(double_result) ^ isnan(atof_result))
+ if (FPBits<double>(double_result).is_nan() !=
+ FPBits<double>(strtod_result).is_nan() ||
+ FPBits<double>(double_result).is_nan() !=
+ FPBits<double>(atof_result).is_nan())
__builtin_trap();
- if (!isnan(double_result) &&
+ if (!FPBits<double>(double_result).is_nan() &&
(double_result != strtod_result || double_result != atof_result))
__builtin_trap();
mpfr_clear(mpfr_double);
@@ -156,9 +159,11 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
ptrdiff_t strtold_strlen = out_ptr - str_ptr;
if (result_strlen != strtold_strlen)
__builtin_trap();
- if (isnan(long_double_result) ^ isnan(strtold_result))
+ if (FPBits<long double>(long_double_result).is_nan() ^
+ FPBits<long double>(strtold_result).is_nan())
__builtin_trap();
- if (!isnan(long_double_result) && long_double_result != strtold_result)
+ if (!FPBits<long double>(long_double_result).is_nan() &&
+ long_double_result != strtold_result)
__builtin_trap();
mpfr_clear(mpfr_long_double);
}
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index da640e8..a2fad9b 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -33,6 +33,15 @@ add_proxy_header_library(
)
add_proxy_header_library(
+ math_function_macros
+ HDRS
+ math_function_macros.h
+ FULL_BUILD_DEPENDS
+ libc.include.llvm-libc-macros.math_function_macros
+ libc.include.math
+)
+
+add_proxy_header_library(
errno_macros
HDRS
errno_macros.h
diff --git a/libc/hdr/math_function_macros.h b/libc/hdr/math_function_macros.h
new file mode 100644
index 0000000..48dec82
--- /dev/null
+++ b/libc/hdr/math_function_macros.h
@@ -0,0 +1,27 @@
+//===-- Definition of macros from math.h ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_MATH_FUNCTION_MACROS_H
+#define LLVM_LIBC_HDR_MATH_FUNCTION_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/math-function-macros.h"
+
+#else // Overlay mode
+
+// GCC will include CXX headers when __cplusplus is defined. This behavior
+// can be suppressed by defining _GLIBCXX_INCLUDE_NEXT_C_HEADERS.
+#if defined(__GNUC__) && !defined(__clang__)
+#define _GLIBCXX_INCLUDE_NEXT_C_HEADERS
+#endif
+#include <math.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_MATH_FUNCTION_MACROS_H
diff --git a/libc/hdr/math_macros.h b/libc/hdr/math_macros.h
index d5a8237..8634511 100644
--- a/libc/hdr/math_macros.h
+++ b/libc/hdr/math_macros.h
@@ -11,7 +11,6 @@
#ifdef LIBC_FULL_BUILD
-#include "include/llvm-libc-macros/math-function-macros.h"
#include "include/llvm-libc-macros/math-macros.h"
#else // Overlay mode
diff --git a/libc/include/llvm-libc-types/jmp_buf.h b/libc/include/llvm-libc-types/jmp_buf.h
index 8949be9..60e033c 100644
--- a/libc/include/llvm-libc-types/jmp_buf.h
+++ b/libc/include/llvm-libc-types/jmp_buf.h
@@ -35,6 +35,11 @@ typedef struct {
#elif defined(__arm__)
// r4, r5, r6, r7, r8, r9, r10, r11, r12, lr
long opaque[10];
+#elif defined(__aarch64__)
+ long opaque[14]; // x19-x29, lr, sp, optional x18
+#if __ARM_FP
+ long fopaque[8]; // d8-d15
+#endif
#else
#error "__jmp_buf not available for your target architecture."
#endif
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 4b7cfc4..ce0b07f 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -51,7 +51,7 @@ foreach(archive IN ZIP_LISTS
PROPERTIES
OUTPUT_NAME ${archive_1}.bc
)
- list(APPEND added_gpu_bitcode_targets ${archive_1}bitcode)
+ list(APPEND added_bitcode_targets ${archive_1}bitcode)
endif()
endforeach()
@@ -61,24 +61,13 @@ install(
COMPONENT libc
)
-if(LIBC_TARGET_OS_IS_GPU)
- set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX})
- if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR)
- set(gpu_install_dir lib${LLVM_LIBDIR_SUFFIX}/${LLVM_HOST_TRIPLE})
- endif()
- install(
- TARGETS ${added_gpu_archive_targets}
- ARCHIVE DESTINATION ${gpu_install_dir}
- COMPONENT libc
+foreach(file ${added_bitcode_targets})
+ install(FILES $<TARGET_FILE:${file}>
+ DESTINATION ${LIBC_INSTALL_LIBRARY_DIR}
+ RENAME $<TARGET_PROPERTY:${file},OUTPUT_NAME>
+ COMPONENT libc
)
- foreach(file ${added_gpu_bitcode_targets})
- install(FILES $<TARGET_PROPERTY:${file},TARGET_OBJECT>
- DESTINATION ${LIBC_INSTALL_LIBRARY_DIR}
- RENAME $<TARGET_PROPERTY:${file},OUTPUT_NAME>
- COMPONENT libc
- )
- endforeach()
-endif()
+endforeach()
if(NOT LIBC_TARGET_OS_IS_BAREMETAL)
# For now we will disable libc-startup installation for baremetal. The
@@ -93,6 +82,7 @@ endif()
add_custom_target(install-libc
DEPENDS ${added_archive_targets}
+ ${added_bitcode_targets}
${startup_target}
${header_install_target}
COMMAND "${CMAKE_COMMAND}"
@@ -100,6 +90,7 @@ add_custom_target(install-libc
-P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
add_custom_target(install-libc-stripped
DEPENDS ${added_archive_targets}
+ ${added_bitcode_targets}
${startup_target}
${header_install_target}
COMMAND "${CMAKE_COMMAND}"
diff --git a/libc/newhdrgen/yaml/pthread.yaml b/libc/newhdrgen/yaml/pthread.yaml
index 292d917..d492dfc 100644
--- a/libc/newhdrgen/yaml/pthread.yaml
+++ b/libc/newhdrgen/yaml/pthread.yaml
@@ -370,6 +370,20 @@ functions:
arguments:
- type: pthread_rwlock_t *__restrict
- type: const struct timespec *__restrict
+ - name: pthread_rwlock_clockrdlock
+ standards: POSIX
+ return_type: int
+ arguments:
+ - type: pthread_rwlock_t *__restrict
+ - type: clockid_t
+ - type: const struct timespec *__restrict
+ - name: pthread_rwlock_clockwrlock
+ standards: POSIX
+ return_type: int
+ arguments:
+ - type: pthread_rwlock_t *__restrict
+ - type: clockid_t
+ - type: const struct timespec *__restrict
- name: pthread_rwlock_rdlock
standards: POSIX
return_type: int
diff --git a/libc/newhdrgen/yaml/stdio.yaml b/libc/newhdrgen/yaml/stdio.yaml
index 687a6d6..660087e 100644
--- a/libc/newhdrgen/yaml/stdio.yaml
+++ b/libc/newhdrgen/yaml/stdio.yaml
@@ -105,6 +105,14 @@ functions:
- type: const char *__restrict
- type: const char *__restrict
- type: ...
+ - name: vsscanf
+ standards:
+ - stdc
+ return_type: int
+ arguments:
+ - type: const char *__restrict
+ - type: const char *__restrict
+ - type: va_list
- name: scanf
standards:
- stdc
diff --git a/libc/spec/gpu_ext.td b/libc/spec/gpu_ext.td
index 5400e0a..dce81ff 100644
--- a/libc/spec/gpu_ext.td
+++ b/libc/spec/gpu_ext.td
@@ -10,14 +10,6 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
RetValSpec<VoidType>,
[ArgSpec<VoidPtr>, ArgSpec<VoidPtr>, ArgSpec<SizeTType>]
>,
- FunctionSpec<
- "rpc_fprintf",
- RetValSpec<IntType>,
- [ArgSpec<FILERestrictedPtr>,
- ArgSpec<ConstCharRestrictedPtr>,
- ArgSpec<VoidPtr>,
- ArgSpec<SizeTType>]
- >,
]
>;
let Headers = [
diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td
index f3a8862..1bd001f 100644
--- a/libc/spec/llvm_libc_ext.td
+++ b/libc/spec/llvm_libc_ext.td
@@ -57,7 +57,10 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> {
[], // Types
[], // Enumerations
[
+ GuardedFunctionSpec<"dfmaf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
GuardedFunctionSpec<"dsqrtf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+ GuardedFunctionSpec<"dsubf128", RetValSpec<DoubleType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
GuardedFunctionSpec<"f16add", RetValSpec<Float16Type>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"f16addf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
diff --git a/libc/spec/posix.td b/libc/spec/posix.td
index 48f743d..1b7e18e 100644
--- a/libc/spec/posix.td
+++ b/libc/spec/posix.td
@@ -547,11 +547,6 @@ def POSIX : StandardSpec<"POSIX"> {
[ArgSpec<VoidType>]
>,
FunctionSpec<
- "gettid",
- RetValSpec<PidT>,
- [ArgSpec<VoidType>]
- >,
- FunctionSpec<
"getuid",
RetValSpec<UidT>,
[ArgSpec<VoidType>]
@@ -607,6 +602,16 @@ def POSIX : StandardSpec<"POSIX"> {
[ArgSpec<ConstCharPtr>]
>,
FunctionSpec<
+ "getpid",
+ RetValSpec<IntType>,
+ [ArgSpec<VoidType>]
+ >,
+ FunctionSpec<
+ "getppid",
+ RetValSpec<IntType>,
+ [ArgSpec<VoidType>]
+ >,
+ FunctionSpec<
"link",
RetValSpec<IntType>,
[ArgSpec<ConstCharPtr>, ArgSpec<ConstCharPtr>]
@@ -1321,6 +1326,16 @@ def POSIX : StandardSpec<"POSIX"> {
[ArgSpec<RestrictedPThreadRWLockTPtr>, ArgSpec<ConstRestrictStructTimeSpecPtr>]
>,
FunctionSpec<
+ "pthread_rwlock_clockrdlock",
+ RetValSpec<IntType>,
+ [ArgSpec<RestrictedPThreadRWLockTPtr>, ArgSpec<ClockIdT>, ArgSpec<ConstRestrictStructTimeSpecPtr>]
+ >,
+ FunctionSpec<
+ "pthread_rwlock_clockwrlock",
+ RetValSpec<IntType>,
+ [ArgSpec<RestrictedPThreadRWLockTPtr>, ArgSpec<ClockIdT>, ArgSpec<ConstRestrictStructTimeSpecPtr>]
+ >,
+ FunctionSpec<
"pthread_rwlock_rdlock",
RetValSpec<IntType>,
[ArgSpec<PThreadRWLockTPtr>]
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 9c84acc..fa536b2 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -397,6 +397,9 @@ def StdC : StandardSpec<"stdc"> {
GuardedFunctionSpec<"ceilf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"ceilf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+ FunctionSpec<"dfmal", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+ FunctionSpec<"dsubl", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+
FunctionSpec<"fabs", RetValSpec<DoubleType>, [ArgSpec<DoubleType>], [ConstAttr]>,
FunctionSpec<"fabsf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
FunctionSpec<"fabsl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
@@ -573,6 +576,7 @@ def StdC : StandardSpec<"stdc"> {
FunctionSpec<"exp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"expf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+ GuardedFunctionSpec<"expf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
FunctionSpec<"exp2", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
FunctionSpec<"exp2f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
@@ -720,8 +724,11 @@ def StdC : StandardSpec<"stdc"> {
GuardedFunctionSpec<"canonicalizef128", RetValSpec<IntType>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"dsqrtl", RetValSpec<DoubleType>, [ArgSpec<LongDoubleType>]>,
-
+
+ FunctionSpec<"totalorder", RetValSpec<IntType>, [ArgSpec<DoublePtr>, ArgSpec<DoublePtr>]>,
+ FunctionSpec<"totalorderf", RetValSpec<IntType>, [ArgSpec<FloatPtr>, ArgSpec<FloatPtr>]>,
GuardedFunctionSpec<"totalorderf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
+ GuardedFunctionSpec<"totalorderf128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
FunctionSpec<"totalordermag", RetValSpec<IntType>, [ArgSpec<DoublePtr>, ArgSpec<DoublePtr>]>,
FunctionSpec<"totalordermagf", RetValSpec<IntType>, [ArgSpec<FloatPtr>, ArgSpec<FloatPtr>]>,
@@ -729,9 +736,15 @@ def StdC : StandardSpec<"stdc"> {
GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"totalordermagf128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
+ FunctionSpec<"getpayload", RetValSpec<DoubleType>, [ArgSpec<DoublePtr>]>,
+ FunctionSpec<"getpayloadf", RetValSpec<FloatType>, [ArgSpec<FloatPtr>]>,
GuardedFunctionSpec<"getpayloadf16", RetValSpec<Float16Type>, [ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
-
+ GuardedFunctionSpec<"getpayloadf128", RetValSpec<Float128Type>, [ArgSpec<Float128Ptr>], "LIBC_TYPES_HAS_FLOAT128">,
+
+ FunctionSpec<"setpayload", RetValSpec<IntType>, [ArgSpec<DoublePtr>, ArgSpec<DoubleType>]>,
+ FunctionSpec<"setpayloadf", RetValSpec<IntType>, [ArgSpec<FloatPtr>, ArgSpec<FloatType>]>,
GuardedFunctionSpec<"setpayloadf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
+ GuardedFunctionSpec<"setpayloadf128", RetValSpec<IntType>, [ArgSpec<Float128Ptr>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
GuardedFunctionSpec<"setpayloadsigf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
@@ -911,6 +924,13 @@ def StdC : StandardSpec<"stdc"> {
ArgSpec<VarArgType>]
>,
FunctionSpec<
+ "vsscanf",
+ RetValSpec<IntType>,
+ [ArgSpec<ConstCharRestrictedPtr>,
+ ArgSpec<ConstCharRestrictedPtr>,
+ ArgSpec<VaListType>]
+ >,
+ FunctionSpec<
"scanf",
RetValSpec<IntType>,
[ArgSpec<ConstCharRestrictedPtr>,
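The vsscanf spec added above mirrors the standard C signature (input buffer, format string, va_list). A minimal caller-side sketch of the usual pattern it enables, a sscanf-style variadic wrapper (illustrative code, not part of this patch):

#include <stdarg.h>
#include <stdio.h>

// Forwarding wrapper: collect the variadic arguments and hand the
// resulting va_list to vsscanf.
static int my_sscanf(const char *buf, const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  int matched = vsscanf(buf, fmt, ap);
  va_end(ap);
  return matched; // number of conversions performed, or EOF on input failure
}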
diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h
index 50f1e19..d68f3ae 100644
--- a/libc/src/__support/FPUtil/BasicOperations.h
+++ b/libc/src/__support/FPUtil/BasicOperations.h
@@ -14,6 +14,7 @@
#include "dyadic_float.h"
#include "src/__support/CPP/type_traits.h"
+#include "src/__support/big_int.h"
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
@@ -320,7 +321,7 @@ totalorder(T x, T y) {
StorageType x_u = x_bits.uintval();
StorageType y_u = y_bits.uintval();
- using signed_t = cpp::make_signed_t<StorageType>;
+ using signed_t = make_integral_or_big_int_signed_t<StorageType>;
signed_t x_signed = static_cast<signed_t>(x_u);
signed_t y_signed = static_cast<signed_t>(y_u);
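The switch to make_integral_or_big_int_signed_t lets the same signed comparison work when StorageType is one of libc's wide big-int types (for example float128 storage on targets without a native __int128). For intuition, the total-order comparison itself can be sketched stand-alone for double by mapping each bit pattern to a monotone unsigned key; this is an equivalent illustration of the semantics, not the libc internals:

#include <bit>
#include <cstdint>

// IEEE-754 totalOrder sketch for double: map bit patterns so that the keys
// increase along -qNaN < -inf < ... < -0 < +0 < ... < +inf < +qNaN,
// then compare the keys.
static bool total_order_leq(double x, double y) {
  auto key = [](double d) {
    std::uint64_t u = std::bit_cast<std::uint64_t>(d);
    // Negative patterns: invert all bits; non-negative: set the sign bit.
    return (u >> 63) ? ~u : (u | (std::uint64_t{1} << 63));
  };
  return key(x) <= key(y);
}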
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index bfdfffb..ea1e0e8 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -189,6 +189,7 @@ add_header_library(
.fp_bits
.fenv_impl
libc.src.__support.CPP.type_traits
+ libc.src.__support.big_int
libc.src.__support.uint128
libc.src.__support.common
libc.src.__support.macros.optimization
diff --git a/libc/src/__support/OSUtil/CMakeLists.txt b/libc/src/__support/OSUtil/CMakeLists.txt
index 517f888..94d1042 100644
--- a/libc/src/__support/OSUtil/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/CMakeLists.txt
@@ -15,20 +15,3 @@ add_object_library(
DEPENDS
${target_os_util}
)
-
-if (LIBC_CONF_ENABLE_PID_CACHE)
- set(libc_copt_enable_pid_cache 1)
-else()
- set(libc_copt_enable_pid_cache 0)
-endif()
-
-if(TARGET libc.src.__support.OSUtil.${LIBC_TARGET_OS}.pid)
- add_object_library(
- pid
- ALIAS
- DEPENDS
- .${LIBC_TARGET_OS}.pid
- COMPILE_OPTIONS
- -DLIBC_COPT_ENABLE_PID_CACHE=${libc_copt_enable_pid_cache}
- )
-endif()
diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt
index 95a83d7..089cad4 100644
--- a/libc/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt
@@ -23,16 +23,3 @@ add_object_library(
libc.hdr.types.struct_f_owner_ex
libc.hdr.types.off_t
)
-
-add_object_library(
- pid
- SRCS
- pid.cpp
- HDRS
- ../pid.h
- DEPENDS
- libc.src.__support.OSUtil.osutil
- libc.src.__support.common
- libc.hdr.types.pid_t
- libc.include.sys_syscall
-)
diff --git a/libc/src/__support/OSUtil/pid.h b/libc/src/__support/OSUtil/pid.h
deleted file mode 100644
index d723abe..0000000
--- a/libc/src/__support/OSUtil/pid.h
+++ /dev/null
@@ -1,41 +0,0 @@
-//===------------ pid_t utilities -------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_PID_H
-#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_PID_H
-#include "hdr/types/pid_t.h"
-#include "src/__support/macros/attributes.h"
-#include "src/__support/macros/optimization.h"
-
-#ifndef LIBC_COPT_ENABLE_PID_CACHE
-#define LIBC_COPT_ENABLE_PID_CACHE 1
-#endif
-
-namespace LIBC_NAMESPACE_DECL {
-
-class ProcessIdentity {
- static LIBC_INLINE_VAR thread_local bool fork_inflight = true;
- static pid_t cache;
- static pid_t get_uncached();
-
-public:
- LIBC_INLINE static void start_fork() { fork_inflight = true; }
- LIBC_INLINE static void end_fork() { fork_inflight = false; }
- LIBC_INLINE static void refresh_cache() { cache = get_uncached(); }
- LIBC_INLINE static pid_t get() {
-#if LIBC_COPT_ENABLE_PID_CACHE
- if (LIBC_LIKELY(!fork_inflight))
- return cache;
-#endif
- return get_uncached();
- }
-};
-
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_PID_H
diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt
index f1a2f16..d2e46b8 100644
--- a/libc/src/__support/threads/CMakeLists.txt
+++ b/libc/src/__support/threads/CMakeLists.txt
@@ -44,12 +44,6 @@ if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.mutex)
)
endif()
-if (LIBC_CONF_ENABLE_TID_CACHE)
- set(libc_copt_enable_tid_cache 1)
-else()
- set(libc_copt_enable_tid_cache 0)
-endif()
-
add_header_library(
thread_common
HDRS
@@ -60,9 +54,6 @@ add_header_library(
libc.src.__support.CPP.optional
libc.src.__support.CPP.string_view
libc.src.__support.CPP.stringstream
- libc.hdr.types.pid_t
- COMPILE_OPTIONS
- -DLIBC_COPT_ENABLE_TID_CACHE=${libc_copt_enable_tid_cache}
)
if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.thread)
@@ -98,21 +89,3 @@ if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.CndVar)
.${LIBC_TARGET_OS}.CndVar
)
endif()
-
-set(tid_dep)
-if (LLVM_LIBC_FULL_BUILD)
- list(APPEND tid_dep libc.src.__support.thread)
-else()
- list(APPEND tid_dep libc.src.__support.OSUtil.osutil)
- list(APPEND tid_dep libc.include.sys_syscall)
-endif()
-
-add_header_library(
- tid
- HDRS
- tid.h
- DEPENDS
- libc.src.__support.common
- libc.hdr.types.pid_t
- ${tid_dep}
-)
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index d86441d..8b79715 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -55,7 +55,6 @@ add_header_library(
libc.src.__support.common
libc.src.__support.OSUtil.osutil
libc.src.__support.CPP.limits
- libc.src.__support.threads.tid
COMPILE_OPTIONS
-DLIBC_COPT_RWLOCK_DEFAULT_SPIN_COUNT=${LIBC_CONF_RWLOCK_DEFAULT_SPIN_COUNT}
${monotonicity_flags}
diff --git a/libc/src/__support/threads/linux/rwlock.h b/libc/src/__support/threads/linux/rwlock.h
index cae8aa6..d2fb0ce 100644
--- a/libc/src/__support/threads/linux/rwlock.h
+++ b/libc/src/__support/threads/linux/rwlock.h
@@ -23,7 +23,6 @@
#include "src/__support/threads/linux/futex_word.h"
#include "src/__support/threads/linux/raw_mutex.h"
#include "src/__support/threads/sleep.h"
-#include "src/__support/threads/tid.h"
#ifndef LIBC_COPT_RWLOCK_DEFAULT_SPIN_COUNT
#define LIBC_COPT_RWLOCK_DEFAULT_SPIN_COUNT 100
@@ -337,6 +336,8 @@ private:
LIBC_INLINE Role get_preference() const {
return static_cast<Role>(preference);
}
+ // TODO: use cached thread id once implemented.
+ LIBC_INLINE static pid_t gettid() { return syscall_impl<pid_t>(SYS_gettid); }
template <Role role> LIBC_INLINE LockResult try_lock(RwState &old) {
if constexpr (role == Role::Reader) {
@@ -358,7 +359,7 @@ private:
if (LIBC_LIKELY(old.compare_exchange_weak_with(
state, old.set_writer_bit(), cpp::MemoryOrder::ACQUIRE,
cpp::MemoryOrder::RELAXED))) {
- writer_tid.store(gettid_inline(), cpp::MemoryOrder::RELAXED);
+ writer_tid.store(gettid(), cpp::MemoryOrder::RELAXED);
return LockResult::Success;
}
// Notice that old is updated by the compare_exchange_weak_with
@@ -393,7 +394,7 @@ private:
unsigned spin_count = LIBC_COPT_RWLOCK_DEFAULT_SPIN_COUNT) {
// Phase 1: deadlock detection.
// A deadlock happens if this is a RAW/WAW lock in the same thread.
- if (writer_tid.load(cpp::MemoryOrder::RELAXED) == gettid_inline())
+ if (writer_tid.load(cpp::MemoryOrder::RELAXED) == gettid())
return LockResult::Deadlock;
#if LIBC_COPT_TIMEOUT_ENSURE_MONOTONICITY
@@ -519,7 +520,7 @@ public:
if (old.has_active_writer()) {
// The lock is held by a writer.
// Check if we are the owner of the lock.
- if (writer_tid.load(cpp::MemoryOrder::RELAXED) != gettid_inline())
+ if (writer_tid.load(cpp::MemoryOrder::RELAXED) != gettid())
return LockResult::PermissionDenied;
// clear writer tid.
writer_tid.store(0, cpp::MemoryOrder::RELAXED);
diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp
index c8ad086..36b4a88 100644
--- a/libc/src/__support/threads/linux/thread.cpp
+++ b/libc/src/__support/threads/linux/thread.cpp
@@ -518,6 +518,4 @@ void thread_exit(ThreadReturnValue retval, ThreadStyle style) {
__builtin_unreachable();
}
-pid_t Thread::get_uncached_tid() { return syscall_impl<pid_t>(SYS_gettid); }
-
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/threads/thread.h b/libc/src/__support/threads/thread.h
index b9ce3d7..68640ce 100644
--- a/libc/src/__support/threads/thread.h
+++ b/libc/src/__support/threads/thread.h
@@ -9,11 +9,6 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_THREAD_H
#define LLVM_LIBC_SRC___SUPPORT_THREADS_THREAD_H
-#ifndef LIBC_COPT_ENABLE_TID_CACHE
-#define LIBC_COPT_ENABLE_TID_CACHE 1
-#endif
-
-#include "hdr/types/pid_t.h"
#include "src/__support/CPP/atomic.h"
#include "src/__support/CPP/optional.h"
#include "src/__support/CPP/string_view.h"
@@ -109,7 +104,7 @@ struct alignas(STACK_ALIGNMENT) ThreadAttributes {
uintptr_t tls; // Address to the thread TLS memory
uintptr_t tls_size; // The size of area pointed to by |tls|.
unsigned char owned_stack; // Indicates if the thread owns this stack memory
- pid_t tid;
+ int tid;
ThreadStyle style;
ThreadReturnValue retval;
ThreadAtExitCallbackMgr *atexit_callback_mgr;
@@ -234,26 +229,6 @@ struct Thread {
// Return the name of the thread in |name|. Return the error number of error.
int get_name(cpp::StringStream &name) const;
-
- static pid_t get_uncached_tid();
-
- LIBC_INLINE void refresh_tid(pid_t cached = -1) {
- if (cached >= 0)
- this->attrib->tid = cached;
- else
- this->attrib->tid = get_uncached_tid();
- }
- LIBC_INLINE void invalidate_tid() { this->attrib->tid = -1; }
-
- LIBC_INLINE pid_t get_tid() {
-#if LIBC_COPT_ENABLE_TID_CACHE
- if (LIBC_UNLIKELY(this->attrib->tid < 0))
- return get_uncached_tid();
- return this->attrib->tid;
-#else
- return get_uncached_tid();
-#endif
- }
};
extern LIBC_THREAD_LOCAL Thread self;
diff --git a/libc/src/__support/threads/tid.h b/libc/src/__support/threads/tid.h
deleted file mode 100644
index a575cff..0000000
--- a/libc/src/__support/threads/tid.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===--- Tid wrapper --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_TID_H
-#define LLVM_LIBC_SRC___SUPPORT_THREADS_TID_H
-
-// This header is for internal usage which automatically dispatches full build
-// and overlay build behaviors.
-
-#include "hdr/types/pid_t.h"
-#include "src/__support/common.h"
-#ifdef LIBC_FULL_BUILD
-#include "src/__support/threads/thread.h"
-#else
-#include "src/__support/OSUtil/syscall.h"
-#include <sys/syscall.h>
-#endif // LIBC_FULL_BUILD
-
-namespace LIBC_NAMESPACE_DECL {
-LIBC_INLINE pid_t gettid_inline() {
-#ifdef LIBC_FULL_BUILD
- return self.get_tid();
-#else
- return syscall_impl<pid_t>(SYS_gettid);
-#endif
-}
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_THREADS_TID_H
diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp
index 639204d..c76ec14 100644
--- a/libc/src/compiler/generic/__stack_chk_fail.cpp
+++ b/libc/src/compiler/generic/__stack_chk_fail.cpp
@@ -13,7 +13,7 @@
extern "C" {
void __stack_chk_fail(void) {
- LIBC_NAMESPACE::write_to_stderr("stack smashing detected");
+ LIBC_NAMESPACE::write_to_stderr("stack smashing detected\n");
LIBC_NAMESPACE::abort();
}
diff --git a/libc/src/gpu/CMakeLists.txt b/libc/src/gpu/CMakeLists.txt
index 4508abe..e202285 100644
--- a/libc/src/gpu/CMakeLists.txt
+++ b/libc/src/gpu/CMakeLists.txt
@@ -8,15 +8,3 @@ add_entrypoint_object(
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
)
-
-add_entrypoint_object(
- rpc_fprintf
- SRCS
- rpc_fprintf.cpp
- HDRS
- rpc_fprintf.h
- DEPENDS
- libc.src.stdio.gpu.gpu_file
- libc.src.__support.RPC.rpc_client
- libc.src.__support.GPU.utils
-)
diff --git a/libc/src/gpu/rpc_fprintf.cpp b/libc/src/gpu/rpc_fprintf.cpp
deleted file mode 100644
index 70056da..0000000
--- a/libc/src/gpu/rpc_fprintf.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- GPU implementation of fprintf -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "rpc_fprintf.h"
-
-#include "src/__support/CPP/string_view.h"
-#include "src/__support/GPU/utils.h"
-#include "src/__support/RPC/rpc_client.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/stdio/gpu/file.h"
-
-namespace LIBC_NAMESPACE_DECL {
-
-template <uint16_t opcode>
-int fprintf_impl(::FILE *__restrict file, const char *__restrict format,
- size_t format_size, void *args, size_t args_size) {
- uint64_t mask = gpu::get_lane_mask();
- rpc::Client::Port port = rpc::client.open<opcode>();
-
- if constexpr (opcode == RPC_PRINTF_TO_STREAM) {
- port.send([&](rpc::Buffer *buffer) {
- buffer->data[0] = reinterpret_cast<uintptr_t>(file);
- });
- }
-
- port.send_n(format, format_size);
- port.recv([&](rpc::Buffer *buffer) {
- args_size = static_cast<size_t>(buffer->data[0]);
- });
- port.send_n(args, args_size);
-
- uint32_t ret = 0;
- for (;;) {
- const char *str = nullptr;
- port.recv([&](rpc::Buffer *buffer) {
- ret = static_cast<uint32_t>(buffer->data[0]);
- str = reinterpret_cast<const char *>(buffer->data[1]);
- });
- // If any lanes have a string argument it needs to be copied back.
- if (!gpu::ballot(mask, str))
- break;
-
- uint64_t size = str ? internal::string_length(str) + 1 : 0;
- port.send_n(str, size);
- }
-
- port.close();
- return ret;
-}
-
-// TODO: Delete this and port OpenMP to use `printf`.
-// place of varargs. Once varargs support is added we will use that to
-// implement the real version.
-LLVM_LIBC_FUNCTION(int, rpc_fprintf,
- (::FILE *__restrict stream, const char *__restrict format,
- void *args, size_t size)) {
- cpp::string_view str(format);
- if (stream == stdout)
- return fprintf_impl<RPC_PRINTF_TO_STDOUT>(stream, format, str.size() + 1,
- args, size);
- else if (stream == stderr)
- return fprintf_impl<RPC_PRINTF_TO_STDERR>(stream, format, str.size() + 1,
- args, size);
- else
- return fprintf_impl<RPC_PRINTF_TO_STREAM>(stream, format, str.size() + 1,
- args, size);
-}
-
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index d70af33..bd022ad 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -89,14 +89,21 @@ add_math_entrypoint_object(cospif)
add_math_entrypoint_object(dmull)
add_math_entrypoint_object(dmulf128)
+add_math_entrypoint_object(dfmal)
+add_math_entrypoint_object(dfmaf128)
+
add_math_entrypoint_object(dsqrtl)
add_math_entrypoint_object(dsqrtf128)
+add_math_entrypoint_object(dsubl)
+add_math_entrypoint_object(dsubf128)
+
add_math_entrypoint_object(erf)
add_math_entrypoint_object(erff)
add_math_entrypoint_object(exp)
add_math_entrypoint_object(expf)
+add_math_entrypoint_object(expf16)
add_math_entrypoint_object(exp2)
add_math_entrypoint_object(exp2f)
@@ -253,7 +260,10 @@ add_math_entrypoint_object(fromfpxl)
add_math_entrypoint_object(fromfpxf16)
add_math_entrypoint_object(fromfpxf128)
+add_math_entrypoint_object(getpayload)
+add_math_entrypoint_object(getpayloadf)
add_math_entrypoint_object(getpayloadf16)
+add_math_entrypoint_object(getpayloadf128)
add_math_entrypoint_object(hypot)
add_math_entrypoint_object(hypotf)
@@ -405,7 +415,10 @@ add_math_entrypoint_object(scalbnl)
add_math_entrypoint_object(scalbnf16)
add_math_entrypoint_object(scalbnf128)
+add_math_entrypoint_object(setpayload)
+add_math_entrypoint_object(setpayloadf)
add_math_entrypoint_object(setpayloadf16)
+add_math_entrypoint_object(setpayloadf128)
add_math_entrypoint_object(setpayloadsigf16)
@@ -433,7 +446,10 @@ add_math_entrypoint_object(tanhf)
add_math_entrypoint_object(tgamma)
add_math_entrypoint_object(tgammaf)
+add_math_entrypoint_object(totalorder)
+add_math_entrypoint_object(totalorderf)
add_math_entrypoint_object(totalorderf16)
+add_math_entrypoint_object(totalorderf128)
add_math_entrypoint_object(totalordermag)
add_math_entrypoint_object(totalordermagf)
diff --git a/libc/src/gpu/rpc_fprintf.h b/libc/src/math/dfmaf128.h
index 7658b21..1b2e728 100644
--- a/libc/src/gpu/rpc_fprintf.h
+++ b/libc/src/math/dfmaf128.h
@@ -1,4 +1,4 @@
-//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
+//===-- Implementation header for dfmaf128 ----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,18 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC_GPU_RPC_HOST_CALL_H
-#define LLVM_LIBC_SRC_GPU_RPC_HOST_CALL_H
+#ifndef LLVM_LIBC_SRC_MATH_DFMAF128_H
+#define LLVM_LIBC_SRC_MATH_DFMAF128_H
-#include "hdr/types/FILE.h"
#include "src/__support/macros/config.h"
-#include <stddef.h>
+#include "src/__support/macros/properties/types.h"
namespace LIBC_NAMESPACE_DECL {
-int rpc_fprintf(::FILE *__restrict stream, const char *__restrict format,
- void *argc, size_t size);
+double dfmaf128(float128 x, float128 y, float128 z);
} // namespace LIBC_NAMESPACE_DECL
-#endif // LLVM_LIBC_SRC_GPU_RPC_HOST_CALL_H
+#endif // LLVM_LIBC_SRC_MATH_DFMAF128_H
diff --git a/libc/src/__support/OSUtil/linux/pid.cpp b/libc/src/math/dfmal.h
index a8499af..e086730 100644
--- a/libc/src/__support/OSUtil/linux/pid.cpp
+++ b/libc/src/math/dfmal.h
@@ -1,4 +1,4 @@
-//===------------ pid_t utilities implementation ----------------*- C++ -*-===//
+//===-- Implementation header for dfmal -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,15 +6,15 @@
//
//===----------------------------------------------------------------------===//
-#include "src/__support/OSUtil/pid.h"
-#include "src/__support/OSUtil/syscall.h"
-#include <sys/syscall.h>
+#ifndef LLVM_LIBC_SRC_MATH_DFMAL_H
+#define LLVM_LIBC_SRC_MATH_DFMAL_H
+
+#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
-pid_t ProcessIdentity::cache = -1;
-pid_t ProcessIdentity::get_uncached() {
- return syscall_impl<pid_t>(SYS_getpid);
-}
+double dfmal(long double x, long double y, long double z);
} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DFMAL_H
diff --git a/libc/src/math/dsubf128.h b/libc/src/math/dsubf128.h
new file mode 100644
index 0000000..8ac58a9
--- /dev/null
+++ b/libc/src/math/dsubf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for dsubf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DSUBF128_H
+#define LLVM_LIBC_SRC_MATH_DSUBF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double dsubf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DSUBF128_H
diff --git a/libc/src/unistd/gettid.cpp b/libc/src/math/dsubl.h
index 6d8ed65..a512bbe 100644
--- a/libc/src/unistd/gettid.cpp
+++ b/libc/src/math/dsubl.h
@@ -1,4 +1,4 @@
-//===-- Implementation file for gettid --------------------------*- C++ -*-===//
+//===-- Implementation header for dsubl -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,15 @@
//
//===----------------------------------------------------------------------===//
-#include "src/unistd/gettid.h"
-#include "src/__support/common.h"
-#include "src/__support/threads/tid.h"
+#ifndef LLVM_LIBC_SRC_MATH_DSUBL_H
+#define LLVM_LIBC_SRC_MATH_DSUBL_H
+
+#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
-LLVM_LIBC_FUNCTION(pid_t, gettid, (void)) { return gettid_inline(); }
+double dsubl(long double x, long double y);
} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DSUBL_H
diff --git a/libc/src/math/expf16.h b/libc/src/math/expf16.h
new file mode 100644
index 0000000..8547f65
--- /dev/null
+++ b/libc/src/math/expf16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for expf16 ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_EXPF16_H
+#define LLVM_LIBC_SRC_MATH_EXPF16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 expf16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_EXPF16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 27b5b94..927d975 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -130,6 +130,31 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ dfmaf128
+ SRCS
+ dfmaf128.cpp
+ HDRS
+ ../dfmaf128.h
+ DEPENDS
+ libc.src.__support.FPUtil.fma
+ libc.src.__support.macros.properties.types
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
+ dfmal
+ SRCS
+ dfmal.cpp
+ HDRS
+ ../dfmal.h
+ DEPENDS
+ libc.src.__support.FPUtil.fma
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
dsqrtl
SRCS
dsqrtl.cpp
@@ -154,6 +179,32 @@ add_entrypoint_object(
-O3
)
+
+add_entrypoint_object(
+ dsubf128
+ SRCS
+ dsubf128.cpp
+ HDRS
+ ../dsubf128.h
+ DEPENDS
+ libc.src.__support.FPUtil.generic.add_sub
+ libc.src.__support.macros.properties.types
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
+ dsubl
+ SRCS
+ dsubl.cpp
+ HDRS
+ ../dsubl.h
+ DEPENDS
+ libc.src.__support.FPUtil.generic.add_sub
+ COMPILE_OPTIONS
+ -O3
+)
+
add_header_library(
range_reduction
HDRS
@@ -1227,6 +1278,28 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ expf16
+ SRCS
+ expf16.cpp
+ HDRS
+ ../expf16.h
+ DEPENDS
+ libc.hdr.errno_macros
+ libc.hdr.fenv_macros
+ libc.src.__support.CPP.array
+ libc.src.__support.FPUtil.except_value_utils
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.nearest_integer
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.optimization
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
exp2
SRCS
exp2.cpp
@@ -3974,6 +4047,30 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ totalorder
+ SRCS
+ totalorder.cpp
+ HDRS
+ ../totalorder.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
+ totalorderf
+ SRCS
+ totalorderf.cpp
+ HDRS
+ ../totalorderf.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
totalorderf16
SRCS
totalorderf16.cpp
@@ -3986,6 +4083,18 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ totalorderf128
+ SRCS
+ totalorderf128.cpp
+ HDRS
+ ../totalorderf128.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ libc.src.__support.macros.properties.types
+ COMPILE_OPTIONS
+ -O3
+)
+add_entrypoint_object(
totalordermag
SRCS
totalordermag.cpp
@@ -4047,6 +4156,30 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ getpayload
+ SRCS
+ getpayload.cpp
+ HDRS
+ ../getpayload.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
+ getpayloadf
+ SRCS
+ getpayloadf.cpp
+ HDRS
+ ../getpayloadf.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
getpayloadf16
SRCS
getpayloadf16.cpp
@@ -4060,6 +4193,43 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ getpayloadf128
+ SRCS
+ getpayloadf128.cpp
+ HDRS
+ ../getpayloadf128.h
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
+ setpayload
+ SRCS
+ setpayload.cpp
+ HDRS
+ ../setpayload.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
+ setpayloadf
+ SRCS
+ setpayloadf.cpp
+ HDRS
+ ../setpayloadf.h
+ DEPENDS
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
setpayloadf16
SRCS
setpayloadf16.cpp
@@ -4073,6 +4243,19 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ setpayloadf128
+ SRCS
+ setpayloadf128.cpp
+ HDRS
+ ../setpayloadf128.h
+ DEPENDS
+ libc.src.__support.macros.properties.types
+ libc.src.__support.FPUtil.basic_operations
+ COMPILE_OPTIONS
+ -O3
+)
+
+add_entrypoint_object(
setpayloadsigf16
SRCS
setpayloadsigf16.cpp
diff --git a/libc/src/math/generic/dfmaf128.cpp b/libc/src/math/generic/dfmaf128.cpp
new file mode 100644
index 0000000..b6e1bdb08
--- /dev/null
+++ b/libc/src/math/generic/dfmaf128.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of dfmaf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_DFMAf128_H
+#define LLVM_LIBC_SRC_MATH_DFMAf128_H
+
+#include "src/math/dfmaf128.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dfmaf128, (float128 x, float128 y, float128 z)) {
+ return fputil::fma<double>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_DFMAf128_H
diff --git a/libc/src/math/generic/dfmal.cpp b/libc/src/math/generic/dfmal.cpp
new file mode 100644
index 0000000..02e0ce8
--- /dev/null
+++ b/libc/src/math/generic/dfmal.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of dfmal function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dfmal.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dfmal,
+ (long double x, long double y, long double z)) {
+ return fputil::fma<double>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsubf128.cpp b/libc/src/math/generic/dsubf128.cpp
new file mode 100644
index 0000000..1b2f121
--- /dev/null
+++ b/libc/src/math/generic/dsubf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dsubf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dsubf128.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dsubf128, (float128 x, float128 y)) {
+ return fputil::generic::sub<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/dsubl.cpp b/libc/src/math/generic/dsubl.cpp
new file mode 100644
index 0000000..8b567d0
--- /dev/null
+++ b/libc/src/math/generic/dsubl.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of dsubl function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/dsubl.h"
+#include "src/__support/FPUtil/generic/add_sub.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, dsubl, (long double x, long double y)) {
+ return fputil::generic::sub<double>(x, y);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
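dsubl and dsubf128 are C23 narrowing subtractions: the mathematically exact long double (resp. float128) difference is rounded once, directly to double precision. That single rounding is the whole point; subtracting first and converting afterwards rounds twice and can land on a different double. A hand-worked sketch, assuming the x87 80-bit long double format (values checked by hand, not taken from this patch):

#include <cstdio>

int main() {
  long double x = 0x1.0000000000001p+0L;  // 1 + 2^-52
  long double y = 0x1.ffffffffffffep-54L; // 2^-53 - 2^-105
  // Exact difference: 1 + 2^-53 + 2^-105, just above the midpoint between
  // 1 and 1 + 2^-52. Rounding it once (what dsubl does) gives 1 + 2^-52.
  // The two-step form below first rounds to the 80-bit format (landing
  // exactly on the midpoint 1 + 2^-53), then ties-to-even pulls it to 1.0.
  double twice = static_cast<double>(x - y);
  std::printf("%a\n", twice); // 0x1p+0 here, vs dsubl(x, y) == 0x1.0000000000001p+0
  return 0;
}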
diff --git a/libc/src/math/generic/expf16.cpp b/libc/src/math/generic/expf16.cpp
new file mode 100644
index 0000000..b198c55
--- /dev/null
+++ b/libc/src/math/generic/expf16.cpp
@@ -0,0 +1,172 @@
+//===-- Half-precision e^x function ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+static constexpr fputil::ExceptValues<float16, 2> EXPF16_EXCEPTS_LO = {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.de4p-8, expf16(x) = 0x1.01cp+0 (RZ)
+ {0x1f79U, 0x3c07U, 1U, 0U, 0U},
+ // x = 0x1.73cp-6, expf16(x) = 0x1.05cp+0 (RZ)
+ {0x25cfU, 0x3c17U, 1U, 0U, 0U},
+}};
+
+static constexpr fputil::ExceptValues<float16, 3> EXPF16_EXCEPTS_HI = {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.c34p+0, expf16(x) = 0x1.74cp+2 (RZ)
+ {0x3f0dU, 0x45d3U, 1U, 0U, 1U},
+ // x = -0x1.488p-5, expf16(x) = 0x1.ebcp-1 (RZ)
+ {0xa922U, 0x3bafU, 1U, 0U, 0U},
+ // x = -0x1.55p-5, expf16(x) = 0x1.ebp-1 (RZ)
+ {0xa954U, 0x3bacU, 1U, 0U, 0U},
+}};
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > for i from -18 to 12 do print(round(exp(i), SG, RN));
+static constexpr cpp::array<float, 31> EXP_HI = {
+ 0x1.05a628p-26f, 0x1.639e32p-25f, 0x1.e355bcp-24f, 0x1.4875cap-22f,
+ 0x1.be6c7p-21f, 0x1.2f6054p-19f, 0x1.9c54c4p-18f, 0x1.183542p-16f,
+ 0x1.7cd79cp-15f, 0x1.02cf22p-13f, 0x1.5fc21p-12f, 0x1.de16bap-11f,
+ 0x1.44e52p-9f, 0x1.b993fep-8f, 0x1.2c155cp-6f, 0x1.97db0cp-5f,
+ 0x1.152aaap-3f, 0x1.78b564p-2f, 0x1p+0f, 0x1.5bf0a8p+1f,
+ 0x1.d8e64cp+2f, 0x1.415e5cp+4f, 0x1.b4c902p+5f, 0x1.28d38ap+7f,
+ 0x1.936dc6p+8f, 0x1.122886p+10f, 0x1.749ea8p+11f, 0x1.fa7158p+12f,
+ 0x1.5829dcp+14f, 0x1.d3c448p+15f, 0x1.3de166p+17f,
+};
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > for i from 0 to 7 do print(round(exp(i * 2^-3), SG, RN));
+static constexpr cpp::array<float, 8> EXP_MID = {
+ 0x1p+0f, 0x1.221604p+0f, 0x1.48b5e4p+0f, 0x1.747a52p+0f,
+ 0x1.a61298p+0f, 0x1.de455ep+0f, 0x1.0ef9dcp+1f, 0x1.330e58p+1f,
+};
+
+LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) {
+ using FPBits = fputil::FPBits<float16>;
+ FPBits x_bits(x);
+
+ uint16_t x_u = x_bits.uintval();
+ uint16_t x_abs = x_u & 0x7fffU;
+
+ // When 0 < |x| <= 2^(-5), or |x| >= 12, or x is NaN.
+ if (LIBC_UNLIKELY(x_abs <= 0x2800U || x_abs >= 0x4a00U)) {
+ // exp(NaN) = NaN
+ if (x_bits.is_nan()) {
+ if (x_bits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+
+ return x;
+ }
+
+ // When x >= 12.
+ if (x_bits.is_pos() && x_abs >= 0x4a00U) {
+ // exp(+inf) = +inf
+ if (x_bits.is_inf())
+ return FPBits::inf().get_val();
+
+ switch (fputil::quick_get_round()) {
+ case FE_TONEAREST:
+ case FE_UPWARD:
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_OVERFLOW);
+ return FPBits::inf().get_val();
+ default:
+ return FPBits::max_normal().get_val();
+ }
+ }
+
+ // When x <= -18.
+ if (x_u >= 0xcc80U) {
+ // exp(-inf) = +0
+ if (x_bits.is_inf())
+ return FPBits::zero().get_val();
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT);
+
+ switch (fputil::quick_get_round()) {
+ case FE_UPWARD:
+ return FPBits::min_subnormal().get_val();
+ default:
+ return FPBits::zero().get_val();
+ }
+ }
+
+ // When 0 < |x| <= 2^(-5).
+ if (x_abs <= 0x2800U && !x_bits.is_zero()) {
+ if (auto r = EXPF16_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+
+ float xf = x;
+ // Degree-3 minimax polynomial generated by Sollya with the following
+ // commands:
+ // > display = hexadecimal;
+ // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]);
+ // > 1 + x * P;
+ return static_cast<float16>(
+ fputil::polyeval(xf, 0x1p+0f, 0x1p+0f, 0x1.0004p-1f, 0x1.555778p-3f));
+ }
+ }
+
+ if (auto r = EXPF16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+
+ // For -18 < x < 12, to compute exp(x), we perform the following range
+ // reduction: find hi, mid, lo, such that:
+ // x = hi + mid + lo, in which
+ // hi is an integer,
+ // mid * 2^3 is an integer,
+ // -2^(-4) <= lo < 2^(-4).
+ // In particular,
+ // hi + mid = round(x * 2^3) * 2^(-3).
+ // Then,
+ // exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
+ // We store exp(hi) and exp(mid) in the lookup tables EXP_HI and EXP_MID
+ // respectively. exp(lo) is computed using a degree-3 minimax polynomial
+ // generated by Sollya.
+
+ float xf = x;
+ float kf = fputil::nearest_integer(xf * 0x1.0p+3f);
+ int x_hi_mid = static_cast<int>(kf);
+ int x_hi = x_hi_mid >> 3;
+ int x_mid = x_hi_mid & 0x7;
+ // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x
+ float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf);
+
+ float exp_hi = EXP_HI[x_hi + 18];
+ float exp_mid = EXP_MID[x_mid];
+ // Degree-3 minimax polynomial generated by Sollya with the following
+ // commands:
+ // > display = hexadecimal;
+ // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-4, 2^-4]);
+ // > 1 + x * P;
+ float exp_lo =
+ fputil::polyeval(lo, 0x1p+0f, 0x1p+0f, 0x1.001p-1f, 0x1.555ddep-3f);
+ return static_cast<float16>(exp_hi * exp_mid * exp_lo);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
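To make the hi/mid/lo reduction in expf16 concrete, consider an input near 2.7 (hand-worked, round-to-nearest assumed): kf = round(2.7 * 8) = 22, so x_hi = 22 >> 3 = 2, x_mid = 22 & 7 = 6, and lo = 2.7 - 22/8 = -0.05, which lies inside [-2^-4, 2^-4]. The result is then reconstructed as EXP_HI[2 + 18] * EXP_MID[6] * exp(lo) = exp(2) * exp(0.75) * exp(-0.05) = exp(2.7), with exp(lo) supplied by the degree-3 polynomial in the final lines.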
diff --git a/libc/src/math/generic/getpayload.cpp b/libc/src/math/generic/getpayload.cpp
new file mode 100644
index 0000000..14d9551
--- /dev/null
+++ b/libc/src/math/generic/getpayload.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of getpayload function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/getpayload.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, getpayload, (const double *x)) {
+ return fputil::getpayload(*x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/getpayloadf.cpp b/libc/src/math/generic/getpayloadf.cpp
new file mode 100644
index 0000000..22db186
--- /dev/null
+++ b/libc/src/math/generic/getpayloadf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of getpayloadf function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/getpayloadf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float, getpayloadf, (const float *x)) {
+ return fputil::getpayload(*x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/getpayloadf128.cpp b/libc/src/math/generic/getpayloadf128.cpp
new file mode 100644
index 0000000..b57469e
--- /dev/null
+++ b/libc/src/math/generic/getpayloadf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of getpayloadf128 function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/getpayloadf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(float128, getpayloadf128, (const float128 *x)) {
+ return fputil::getpayload(*x);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/setpayload.cpp b/libc/src/math/generic/setpayload.cpp
new file mode 100644
index 0000000..7e7078c
--- /dev/null
+++ b/libc/src/math/generic/setpayload.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of setpayload function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/setpayload.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, setpayload, (double *res, double pl)) {
+ return static_cast<int>(fputil::setpayload</*IsSignaling=*/false>(*res, pl));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/setpayloadf.cpp b/libc/src/math/generic/setpayloadf.cpp
new file mode 100644
index 0000000..50d2ffd
--- /dev/null
+++ b/libc/src/math/generic/setpayloadf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of setpayloadf function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/setpayloadf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, setpayloadf, (float *res, float pl)) {
+ return static_cast<int>(fputil::setpayload</*IsSignaling=*/false>(*res, pl));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/setpayloadf128.cpp b/libc/src/math/generic/setpayloadf128.cpp
new file mode 100644
index 0000000..a50e5ef
--- /dev/null
+++ b/libc/src/math/generic/setpayloadf128.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of setpayloadf128 function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/setpayloadf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, setpayloadf128, (float128 * res, float128 pl)) {
+ return static_cast<int>(fputil::setpayload</*IsSignaling=*/false>(*res, pl));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
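getpayload and setpayload (and their f/f16/f128 variants) are the C23 NaN-payload helpers: setpayload stores into *res a quiet NaN carrying the requested payload and returns zero on success (nonzero if the payload cannot be represented), while getpayload reads a NaN's payload back as a positive value of the same type. A minimal round-trip sketch, assuming headers that expose the C23 declarations:

#include <math.h>
#include <stdio.h>

int main(void) {
  double q;
  // Build a quiet NaN with payload 42, then read the payload back.
  if (setpayload(&q, 42.0) == 0)
    printf("payload = %g\n", getpayload(&q)); // prints: payload = 42
  return 0;
}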
diff --git a/libc/src/math/generic/totalorder.cpp b/libc/src/math/generic/totalorder.cpp
new file mode 100644
index 0000000..f052c81
--- /dev/null
+++ b/libc/src/math/generic/totalorder.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalorder function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalorder.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalorder, (const double *x, const double *y)) {
+ return static_cast<int>(fputil::totalorder(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalorderf.cpp b/libc/src/math/generic/totalorderf.cpp
new file mode 100644
index 0000000..17c1304
--- /dev/null
+++ b/libc/src/math/generic/totalorderf.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of totalorderf function ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalorderf.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalorderf, (const float *x, const float *y)) {
+ return static_cast<int>(fputil::totalorder(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/totalorderf128.cpp b/libc/src/math/generic/totalorderf128.cpp
new file mode 100644
index 0000000..83d7768
--- /dev/null
+++ b/libc/src/math/generic/totalorderf128.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of totalorderf128 function -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/totalorderf128.h"
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, totalorderf128,
+ (const float128 *x, const float128 *y)) {
+ return static_cast<int>(fputil::totalorder(*x, *y));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
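On the caller side, totalorder takes pointers and returns nonzero exactly when *x precedes or equals *y in the IEEE-754 total order, which, unlike the < and <= operators, also orders NaNs and places -0 before +0. One common use is a sort comparator that tolerates NaNs; a sketch, assuming headers exposing the C23 declaration:

#include <algorithm>
#include <math.h>
#include <vector>

// Sort into IEEE-754 total order, NaNs included. "a strictly precedes b"
// is expressed as "b does not precede-or-equal a".
void sort_total_order(std::vector<double> &v) {
  std::sort(v.begin(), v.end(),
            [](double a, double b) { return totalorder(&b, &a) == 0; });
}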
diff --git a/libc/src/unistd/gettid.h b/libc/src/math/getpayload.h
index 4228319..b00d313 100644
--- a/libc/src/unistd/gettid.h
+++ b/libc/src/math/getpayload.h
@@ -1,4 +1,4 @@
-//===-- Implementation header for gettid ------------------------*- C++ -*-===//
+//===-- Implementation header for getpayload --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,16 +6,15 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC_UNISTD_GETTID_H
-#define LLVM_LIBC_SRC_UNISTD_GETTID_H
+#ifndef LLVM_LIBC_SRC_MATH_GETPAYLOAD_H
+#define LLVM_LIBC_SRC_MATH_GETPAYLOAD_H
-#include "hdr/types/pid_t.h"
-#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
-pid_t gettid(void);
+double getpayload(const double *x);
} // namespace LIBC_NAMESPACE_DECL
-#endif // LLVM_LIBC_SRC_UNISTD_GETTID_H
+#endif // LLVM_LIBC_SRC_MATH_GETPAYLOAD_H
diff --git a/libc/src/math/getpayloadf.h b/libc/src/math/getpayloadf.h
new file mode 100644
index 0000000..20901cd
--- /dev/null
+++ b/libc/src/math/getpayloadf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for getpayloadf -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GETPAYLOADF_H
+#define LLVM_LIBC_SRC_MATH_GETPAYLOADF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float getpayloadf(const float *x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_GETPAYLOADF_H
diff --git a/libc/src/math/getpayloadf128.h b/libc/src/math/getpayloadf128.h
new file mode 100644
index 0000000..7ebb429
--- /dev/null
+++ b/libc/src/math/getpayloadf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for getpayloadf128 ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_GETPAYLOADF128_H
+#define LLVM_LIBC_SRC_MATH_GETPAYLOADF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float128 getpayloadf128(const float128 *x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_GETPAYLOADF128_H
diff --git a/libc/src/math/setpayload.h b/libc/src/math/setpayload.h
new file mode 100644
index 0000000..3f30673
--- /dev/null
+++ b/libc/src/math/setpayload.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for setpayload --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SETPAYLOAD_H
+#define LLVM_LIBC_SRC_MATH_SETPAYLOAD_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int setpayload(double *res, double pl);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SETPAYLOAD_H
diff --git a/libc/src/math/setpayloadf.h b/libc/src/math/setpayloadf.h
new file mode 100644
index 0000000..95544c8
--- /dev/null
+++ b/libc/src/math/setpayloadf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for setpayloadf -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SETPAYLOADF_H
+#define LLVM_LIBC_SRC_MATH_SETPAYLOADF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int setpayloadf(float *res, float pl);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SETPAYLOADF_H
diff --git a/libc/src/math/setpayloadf128.h b/libc/src/math/setpayloadf128.h
new file mode 100644
index 0000000..e46aef3
--- /dev/null
+++ b/libc/src/math/setpayloadf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for setpayloadf128 ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SETPAYLOADF128_H
+#define LLVM_LIBC_SRC_MATH_SETPAYLOADF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int setpayloadf128(float128 *res, float128 pl);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SETPAYLOADF128_H
diff --git a/libc/src/math/totalorder.h b/libc/src/math/totalorder.h
new file mode 100644
index 0000000..d8d0297
--- /dev/null
+++ b/libc/src/math/totalorder.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalorder --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDER_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDER_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalorder(const double *x, const double *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDER_H
diff --git a/libc/src/math/totalorderf.h b/libc/src/math/totalorderf.h
new file mode 100644
index 0000000..bade04c
--- /dev/null
+++ b/libc/src/math/totalorderf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for totalorderf -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERF_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERF_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalorderf(const float *x, const float *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERF_H
diff --git a/libc/src/math/totalorderf128.h b/libc/src/math/totalorderf128.h
new file mode 100644
index 0000000..9587148
--- /dev/null
+++ b/libc/src/math/totalorderf128.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for totalorderf128 ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_TOTALORDERF128_H
+#define LLVM_LIBC_SRC_MATH_TOTALORDERF128_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int totalorderf128(const float128 *x, const float128 *y);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_TOTALORDERF128_H
diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt
index dc748b2..70d10e6 100644
--- a/libc/src/pthread/CMakeLists.txt
+++ b/libc/src/pthread/CMakeLists.txt
@@ -557,6 +557,28 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ pthread_rwlock_clockrdlock
+ SRCS
+ pthread_rwlock_clockrdlock.cpp
+ HDRS
+ pthread_rwlock_clockrdlock.h
+ DEPENDS
+ libc.include.pthread
+ libc.src.__support.threads.linux.rwlock
+)
+
+add_entrypoint_object(
+ pthread_rwlock_clockwrlock
+ SRCS
+ pthread_rwlock_clockwrlock.cpp
+ HDRS
+ pthread_rwlock_clockwrlock.h
+ DEPENDS
+ libc.include.pthread
+ libc.src.__support.threads.linux.rwlock
+)
+
+add_entrypoint_object(
pthread_rwlock_timedrdlock
SRCS
pthread_rwlock_timedrdlock.cpp
diff --git a/libc/src/pthread/pthread_rwlock_clockrdlock.cpp b/libc/src/pthread/pthread_rwlock_clockrdlock.cpp
new file mode 100644
index 0000000..1e44e6d
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockrdlock.cpp
@@ -0,0 +1,50 @@
+//===-- Implementation of the Rwlock's clockrdlock function ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/pthread/pthread_rwlock_clockrdlock.h"
+
+#include "hdr/errno_macros.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/linux/rwlock.h"
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+static_assert(
+ sizeof(RwLock) == sizeof(pthread_rwlock_t) &&
+ alignof(RwLock) == alignof(pthread_rwlock_t),
+ "The public pthread_rwlock_t type must be of the same size and alignment "
+ "as the internal rwlock type.");
+
+LLVM_LIBC_FUNCTION(int, pthread_rwlock_clockrdlock,
+ (pthread_rwlock_t * rwlock, clockid_t clockid,
+ const timespec *abstime)) {
+ if (!rwlock)
+ return EINVAL;
+ if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME)
+ return EINVAL;
+ bool is_realtime = (clockid == CLOCK_REALTIME);
+ RwLock *rw = reinterpret_cast<RwLock *>(rwlock);
+ LIBC_ASSERT(abstime && "clockrdlock called with a null timeout");
+ auto timeout = internal::AbsTimeout::from_timespec(
+ *abstime, /*is_realtime=*/is_realtime);
+ if (LIBC_LIKELY(timeout.has_value()))
+ return static_cast<int>(rw->read_lock(timeout.value()));
+
+ switch (timeout.error()) {
+ case internal::AbsTimeout::Error::Invalid:
+ return EINVAL;
+ case internal::AbsTimeout::Error::BeforeEpoch:
+ return ETIMEDOUT;
+ }
+ __builtin_unreachable();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_rwlock_clockrdlock.h b/libc/src/pthread/pthread_rwlock_clockrdlock.h
new file mode 100644
index 0000000..8fbd3b0
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockrdlock.h
@@ -0,0 +1,23 @@
+//===-- Implementation header for Rwlock's clockrdlock function --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H
+
+#include "src/__support/macros/config.h"
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_rwlock_clockrdlock(pthread_rwlock_t *__restrict rwlock,
+ clockid_t clockid,
+ const timespec *__restrict abstime);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKRDLOCK_H
diff --git a/libc/src/pthread/pthread_rwlock_clockwrlock.cpp b/libc/src/pthread/pthread_rwlock_clockwrlock.cpp
new file mode 100644
index 0000000..8f58c7f
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockwrlock.cpp
@@ -0,0 +1,51 @@
+//===-- Implementation of the Rwlock's clockwrlock function ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/pthread/pthread_rwlock_clockwrlock.h"
+
+#include "hdr/errno_macros.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/threads/linux/rwlock.h"
+#include "src/__support/time/linux/abs_timeout.h"
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+static_assert(
+ sizeof(RwLock) == sizeof(pthread_rwlock_t) &&
+ alignof(RwLock) == alignof(pthread_rwlock_t),
+ "The public pthread_rwlock_t type must be of the same size and alignment "
+ "as the internal rwlock type.");
+
+LLVM_LIBC_FUNCTION(int, pthread_rwlock_clockwrlock,
+ (pthread_rwlock_t * rwlock, clockid_t clockid,
+ const timespec *abstime)) {
+ if (!rwlock)
+ return EINVAL;
+ if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME)
+ return EINVAL;
+ bool is_realtime = (clockid == CLOCK_REALTIME);
+ RwLock *rw = reinterpret_cast<RwLock *>(rwlock);
+ LIBC_ASSERT(abstime && "clockwrlock called with a null timeout");
+ auto timeout = internal::AbsTimeout::from_timespec(
+ *abstime, /*is_realtime=*/is_realtime);
+ if (LIBC_LIKELY(timeout.has_value()))
+ return static_cast<int>(rw->write_lock(timeout.value()));
+
+ switch (timeout.error()) {
+ case internal::AbsTimeout::Error::Invalid:
+ return EINVAL;
+ case internal::AbsTimeout::Error::BeforeEpoch:
+ return ETIMEDOUT;
+ }
+ __builtin_unreachable();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_rwlock_clockwrlock.h b/libc/src/pthread/pthread_rwlock_clockwrlock.h
new file mode 100644
index 0000000..cb3fa39
--- /dev/null
+++ b/libc/src/pthread/pthread_rwlock_clockwrlock.h
@@ -0,0 +1,23 @@
+//===-- Implementation header for Rwlock's clockwrlock function --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H
+
+#include "src/__support/macros/config.h"
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+int pthread_rwlock_clockwrlock(pthread_rwlock_t *__restrict rwlock,
+ clockid_t clockid,
+ const timespec *__restrict abstime);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_RWLOCK_CLOCKWRLOCK_H
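
A minimal caller-side sketch of the two new rwlock entry points above (illustration only; it assumes the clockid/absolute-deadline signature declared in the headers):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

// Take a read lock with an absolute CLOCK_MONOTONIC deadline 50 ms from now.
// pthread_rwlock_clockwrlock is used the same way for the write side.
int try_read_for_50ms(pthread_rwlock_t *lock) {
  timespec deadline;
  clock_gettime(CLOCK_MONOTONIC, &deadline);
  deadline.tv_nsec += 50'000'000;
  if (deadline.tv_nsec >= 1'000'000'000) {
    deadline.tv_nsec -= 1'000'000'000;
    ++deadline.tv_sec;
  }
  int rc = pthread_rwlock_clockrdlock(lock, CLOCK_MONOTONIC, &deadline);
  if (rc == ETIMEDOUT)
    fprintf(stderr, "reader timed out\n");
  return rc; // 0 on success; the caller unlocks with pthread_rwlock_unlock.
}
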
diff --git a/libc/src/setjmp/aarch64/CMakeLists.txt b/libc/src/setjmp/aarch64/CMakeLists.txt
new file mode 100644
index 0000000..47eeb1a
--- /dev/null
+++ b/libc/src/setjmp/aarch64/CMakeLists.txt
@@ -0,0 +1,28 @@
+if(LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER)
+ list(APPEND setjmp_config_options "-DLIBC_COPT_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER")
+endif()
+if(setjmp_config_options)
+ list(PREPEND setjmp_config_options "COMPILE_OPTIONS")
+endif()
+
+add_entrypoint_object(
+ setjmp
+ SRCS
+ setjmp.cpp
+ HDRS
+ ../setjmp_impl.h
+ DEPENDS
+ libc.include.setjmp
+ ${setjmp_config_options}
+)
+
+add_entrypoint_object(
+ longjmp
+ SRCS
+ longjmp.cpp
+ HDRS
+ ../longjmp.h
+ DEPENDS
+ libc.include.setjmp
+ ${setjmp_config_options}
+)
diff --git a/libc/src/setjmp/aarch64/longjmp.cpp b/libc/src/setjmp/aarch64/longjmp.cpp
new file mode 100644
index 0000000..fbb8652
--- /dev/null
+++ b/libc/src/setjmp/aarch64/longjmp.cpp
@@ -0,0 +1,92 @@
+//===-- Implementation of longjmp for AArch64 -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/setjmp/longjmp.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// TODO: if MTE stack tagging is in use (-fsanitize=memtag-stack), we need to
+// iterate over the region between the old and new values of sp, using STG or
+// ST2G instructions to clear the memory tags on the invalidated region of the
+// stack. But this requires a means of finding out that we're in that mode, and
+// as far as I can see there isn't currently a predefined macro for that.
+//
+// (__ARM_FEATURE_MEMORY_TAGGING only indicates whether the target architecture
+// supports the MTE instructions, not whether the compiler is configured to use
+// them.)
+
+[[gnu::naked]] LLVM_LIBC_FUNCTION(void, longjmp,
+ ([[maybe_unused]] __jmp_buf * buf,
+ [[maybe_unused]] int val)) {
+ // If BTI branch protection is in use, the compiler will automatically insert
+ // a BTI here, so we don't need to make any extra effort to do so.
+
+ // If PAC branch protection is in use, there's no need to sign the return
+ // address at the start of longjmp, because we're not going to use it anyway!
+
+ asm(
+ // Reload the callee-saved GPRs, including fp and lr.
+ R"(
+ ldp x19, x20, [x0, #0*16]
+ ldp x21, x22, [x0, #1*16]
+ ldp x23, x24, [x0, #2*16]
+ ldp x25, x26, [x0, #3*16]
+ ldp x27, x28, [x0, #4*16]
+ ldp x29, x30, [x0, #5*16]
+ )"
+
+#if LIBC_COPT_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER
+ // Reload the stack pointer, and the platform register x18.
+ R"(
+ ldp x2, x18, [x0, #6*16]
+ mov sp, x2
+ )"
+#else
+ // Reload just the stack pointer.
+ R"(
+ ldr x2, [x0, #6*16]
+ mov sp, x2
+ )"
+#endif
+
+#if __ARM_FP
+ // Reload the callee-saved FP registers.
+ R"(
+ ldp d8, d9, [x0, #7*16]
+ ldp d10, d11, [x0, #8*16]
+ ldp d12, d13, [x0, #9*16]
+ ldp d14, d15, [x0, #10*16]
+ )"
+#endif
+
+ // Calculate the return value.
+ R"(
+ cmp w1, #0
+ cinc w0, w1, eq
+ )"
+
+#if __ARM_FEATURE_PAC_DEFAULT & 1
+ // Authenticate the return address using the PAC A key.
+ R"(
+ autiasp
+ )"
+#elif __ARM_FEATURE_PAC_DEFAULT & 2
+ // Authenticate the return address using the PAC B key.
+ R"(
+ autibsp
+ )"
+#endif
+
+ R"(
+ ret
+ )");
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/setjmp/aarch64/setjmp.cpp b/libc/src/setjmp/aarch64/setjmp.cpp
new file mode 100644
index 0000000..90e49be
--- /dev/null
+++ b/libc/src/setjmp/aarch64/setjmp.cpp
@@ -0,0 +1,94 @@
+//===-- Implementation of setjmp for AArch64 ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/setjmp/setjmp_impl.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+[[gnu::naked]] LLVM_LIBC_FUNCTION(int, setjmp,
+ ([[maybe_unused]] __jmp_buf * buf)) {
+ // If BTI branch protection is in use, the compiler will automatically insert
+ // a BTI here, so we don't need to make any extra effort to do so.
+
+ asm(
+#if __ARM_FEATURE_PAC_DEFAULT & 1
+ // Sign the return address using the PAC A key.
+ R"(
+ paciasp
+ )"
+#elif __ARM_FEATURE_PAC_DEFAULT & 2
+ // Sign the return address using the PAC B key.
+ R"(
+ pacibsp
+ )"
+#endif
+
+ // Store all the callee-saved GPRs, including fp (x29) and also lr (x30).
+ // Of course lr isn't normally callee-saved (the call instruction itself
+ // can't help clobbering it), but we certainly need to save it for this
+ // purpose.
+ R"(
+ stp x19, x20, [x0, #0*16]
+ stp x21, x22, [x0, #1*16]
+ stp x23, x24, [x0, #2*16]
+ stp x25, x26, [x0, #3*16]
+ stp x27, x28, [x0, #4*16]
+ stp x29, x30, [x0, #5*16]
+ )"
+
+#if LIBC_COPT_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER
+ // Store the stack pointer, and the platform register x18.
+ R"(
+ add x1, sp, #0
+ stp x1, x18, [x0, #6*16]
+ )"
+#else
+ // Store just the stack pointer.
+ R"(
+ add x1, sp, #0
+ str x1, [x0, #6*16]
+ )"
+#endif
+
+#if __ARM_FP
+ // Store the callee-saved FP registers. AAPCS64 only requires the low 64
+ // bits of v8-v15 to be preserved, i.e. each of d8,...,d15.
+ R"(
+ stp d8, d9, [x0, #7*16]
+ stp d10, d11, [x0, #8*16]
+ stp d12, d13, [x0, #9*16]
+ stp d14, d15, [x0, #10*16]
+ )"
+#endif
+
+ // Set up return value of zero.
+ R"(
+ mov x0, #0
+ )"
+
+#if (__ARM_FEATURE_PAC_DEFAULT & 7) == 5
+ // Authenticate the return address using the PAC A key, since the
+ // compilation options ask for PAC protection even on leaf functions.
+ R"(
+ autiasp
+ )"
+#elif (__ARM_FEATURE_PAC_DEFAULT & 7) == 6
+ // Same, but using the PAC B key.
+ R"(
+ autibsp
+ )"
+#endif
+
+ R"(
+ ret
+ )");
+}
+
+} // namespace LIBC_NAMESPACE_DECL
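
A small usage sketch of the contract the two AArch64 files above implement (illustration only): setjmp saves x19-x28, fp, lr and sp (plus d8-d15 when FP is enabled) and returns 0; longjmp restores them and makes that setjmp return again with a nonzero value.

#include <setjmp.h>
#include <stdio.h>

static jmp_buf buf;

static void fail() { longjmp(buf, 42); } // unwinds back to the setjmp site

int main() {
  if (int rc = setjmp(buf)) {
    printf("returned via longjmp, val = %d\n", rc); // prints 42
    return 0;
  }
  fail();
}
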
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index 2d528a9..94f9235 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -122,6 +122,18 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ vsscanf
+ SRCS
+ vsscanf.cpp
+ HDRS
+ vsscanf.h
+ DEPENDS
+ libc.src.__support.arg_list
+ libc.src.stdio.scanf_core.reader
+ libc.src.stdio.scanf_core.scanf_main
+)
+
+add_entrypoint_object(
fscanf
SRCS
fscanf.cpp
diff --git a/libc/src/stdio/vsscanf.cpp b/libc/src/stdio/vsscanf.cpp
new file mode 100644
index 0000000..fcf0b88
--- /dev/null
+++ b/libc/src/stdio/vsscanf.cpp
@@ -0,0 +1,33 @@
+//===-- Implementation of vsscanf -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vsscanf.h"
+
+#include "src/__support/CPP/limits.h"
+#include "src/__support/arg_list.h"
+#include "src/stdio/scanf_core/reader.h"
+#include "src/stdio/scanf_core/scanf_main.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, vsscanf,
+ (const char *buffer, const char *format, va_list vlist)) {
+ internal::ArgList args(vlist);
+ scanf_core::ReadBuffer rb{const_cast<char *>(buffer),
+ cpp::numeric_limits<size_t>::max()};
+ scanf_core::Reader reader(&rb);
+ int ret_val = scanf_core::scanf_main(&reader, format, args);
+ // This is done to avoid including stdio.h in the internals. On most systems
+ // EOF is -1, so this will be transformed into just "return ret_val".
+ return (ret_val == -1) ? EOF : ret_val;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/vsscanf.h b/libc/src/stdio/vsscanf.h
new file mode 100644
index 0000000..992c44d
--- /dev/null
+++ b/libc/src/stdio/vsscanf.h
@@ -0,0 +1,21 @@
+//===-- Implementation header of vsscanf ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_VSSCANF_H
+#define LLVM_LIBC_SRC_STDIO_VSSCANF_H
+
+#include "src/__support/macros/config.h"
+#include <stdarg.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+int vsscanf(const char *s, const char *format, va_list vlist);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_VSSCANF_H
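
A minimal caller sketch for the new vsscanf entry point (illustration only): the usual pattern is a variadic wrapper that forwards its va_list.

#include <stdarg.h>
#include <stdio.h>

// Parse from a string buffer through a printf-style variadic wrapper.
static int parse(const char *buffer, const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  int n = vsscanf(buffer, fmt, ap); // number of conversions, or EOF on failure
  va_end(ap);
  return n;
}

int main() {
  int year, month;
  if (parse("2024-07", "%d-%d", &year, &month) == 2)
    printf("%d %d\n", year, month); // 2024 7
}
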
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index d79acb3..0f363ee 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -443,14 +443,23 @@ if(LIBC_TARGET_OS_IS_GPU)
DEPENDS
.${LIBC_TARGET_OS}.free
)
- add_entrypoint_external(
- calloc
- )
- add_entrypoint_external(
+ add_entrypoint_object(
realloc
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_OS}.realloc
)
- add_entrypoint_external(
+ add_entrypoint_object(
+ calloc
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_OS}.calloc
+ )
+ add_entrypoint_object(
aligned_alloc
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_OS}.aligned_alloc
)
endif()
diff --git a/libc/src/stdlib/gpu/CMakeLists.txt b/libc/src/stdlib/gpu/CMakeLists.txt
index f8a11ec..073f815 100644
--- a/libc/src/stdlib/gpu/CMakeLists.txt
+++ b/libc/src/stdlib/gpu/CMakeLists.txt
@@ -21,6 +21,39 @@ add_entrypoint_object(
)
add_entrypoint_object(
+ realloc
+ SRCS
+ realloc.cpp
+ HDRS
+ ../realloc.h
+ DEPENDS
+ libc.include.stdlib
+ libc.src.__support.GPU.allocator
+)
+
+add_entrypoint_object(
+ calloc
+ SRCS
+ calloc.cpp
+ HDRS
+ ../calloc.h
+ DEPENDS
+ libc.include.stdlib
+ libc.src.__support.GPU.allocator
+)
+
+add_entrypoint_object(
+ aligned_alloc
+ SRCS
+ aligned_alloc.cpp
+ HDRS
+ ../aligned_alloc.h
+ DEPENDS
+ libc.include.stdlib
+ libc.src.__support.GPU.allocator
+)
+
+add_entrypoint_object(
abort
SRCS
abort.cpp
diff --git a/libc/src/stdlib/gpu/aligned_alloc.cpp b/libc/src/stdlib/gpu/aligned_alloc.cpp
new file mode 100644
index 0000000..cd2c7e5
--- /dev/null
+++ b/libc/src/stdlib/gpu/aligned_alloc.cpp
@@ -0,0 +1,29 @@
+//===-- GPU Implementation of aligned_alloc -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/aligned_alloc.h"
+
+#include "src/__support/GPU/allocator.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(void *, aligned_alloc, (size_t alignment, size_t size)) {
+ if ((alignment & -alignment) != alignment)
+ return nullptr;
+
+ void *ptr = gpu::allocate(size);
+ if ((reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) != 0) {
+ gpu::deallocate(ptr);
+ return nullptr;
+ }
+ return ptr;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
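
The GPU aligned_alloc above validates, rather than adjusts, the result's alignment. A host-side sketch of the two bit tricks it relies on (illustration only; it adds an explicit zero guard for clarity):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// alignment is a power of two exactly when its lowest set bit is the whole
// value; a pointer satisfies that alignment when its low bits are all zero.
static bool is_pow2(size_t alignment) {
  return alignment != 0 && (alignment & -alignment) == alignment;
}
static bool is_aligned(const void *p, size_t alignment) {
  return (reinterpret_cast<uintptr_t>(p) & (alignment - 1)) == 0;
}

int main() {
  std::printf("%d %d\n", is_pow2(64), is_pow2(48)); // 1 0
  int x = 0;
  std::printf("%d\n", is_aligned(&x, alignof(int))); // 1
}
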
diff --git a/libc/src/stdlib/gpu/calloc.cpp b/libc/src/stdlib/gpu/calloc.cpp
new file mode 100644
index 0000000..9150aff
--- /dev/null
+++ b/libc/src/stdlib/gpu/calloc.cpp
@@ -0,0 +1,31 @@
+//===-- GPU Implementation of calloc --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/calloc.h"
+
+#include "src/__support/GPU/allocator.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memset.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(void *, calloc, (size_t num, size_t size)) {
+ size_t bytes = num * size;
+ if (bytes == 0)
+ return nullptr;
+
+ void *ptr = gpu::allocate(bytes);
+ if (!ptr)
+ return nullptr;
+
+ inline_memset(ptr, 0, bytes);
+ return ptr;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
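
One caveat in the calloc above: num * size can wrap for huge arguments. A defensive variant (sketch only, not what this patch installs) checks the multiplication first with the GCC/Clang overflow builtin:

#include <cstddef>
#include <cstdlib>
#include <cstring>

// Overflow-checked num * size before allocating and zero-filling; std::malloc
// and std::memset stand in for gpu::allocate and inline_memset from the patch.
void *checked_calloc(size_t num, size_t size) {
  size_t bytes;
  if (__builtin_mul_overflow(num, size, &bytes) || bytes == 0)
    return nullptr;
  void *ptr = std::malloc(bytes);
  if (!ptr)
    return nullptr;
  std::memset(ptr, 0, bytes);
  return ptr;
}
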
diff --git a/libc/src/stdlib/gpu/realloc.cpp b/libc/src/stdlib/gpu/realloc.cpp
new file mode 100644
index 0000000..4fd4d6b
--- /dev/null
+++ b/libc/src/stdlib/gpu/realloc.cpp
@@ -0,0 +1,32 @@
+//===-- GPU Implementation of realloc -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdlib/realloc.h"
+
+#include "src/__support/GPU/allocator.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(void *, realloc, (void *ptr, size_t size)) {
+ if (ptr == nullptr)
+ return gpu::allocate(size);
+
+ void *newmem = gpu::allocate(size);
+ if (newmem == nullptr)
+ return nullptr;
+
+ // This will copy garbage if it goes beyond the old allocation size.
+ inline_memcpy(newmem, ptr, size);
+ gpu::deallocate(ptr);
+ return newmem;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
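
Because gpu::allocate does not expose the old block's size, the realloc above copies size bytes unconditionally, as its comment notes. When the caller can track sizes itself, the usual workaround clamps the copy (sketch only, with std::malloc/std::free standing in for the GPU allocator):

#include <cstddef>
#include <cstdlib>
#include <cstring>

void *realloc_tracked(void *old_ptr, size_t old_size, size_t new_size) {
  if (!old_ptr)
    return std::malloc(new_size);
  void *new_ptr = std::malloc(new_size);
  if (!new_ptr)
    return nullptr;
  // Copy only the bytes that exist in both blocks.
  std::memcpy(new_ptr, old_ptr, old_size < new_size ? old_size : new_size);
  std::free(old_ptr);
  return new_ptr;
}
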
diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt
index ec76712..ddafcd7 100644
--- a/libc/src/unistd/CMakeLists.txt
+++ b/libc/src/unistd/CMakeLists.txt
@@ -333,13 +333,3 @@ add_entrypoint_external(
add_entrypoint_external(
opterr
)
-
-add_entrypoint_object(
- gettid
- SRCS
- gettid.cpp
- HDRS
- gettid.h
- DEPENDS
- libc.src.__support.threads.tid
-)
diff --git a/libc/src/unistd/getpid.h b/libc/src/unistd/getpid.h
index 5812df0..c3c55b0 100644
--- a/libc/src/unistd/getpid.h
+++ b/libc/src/unistd/getpid.h
@@ -9,12 +9,12 @@
#ifndef LLVM_LIBC_SRC_UNISTD_GETPID_H
#define LLVM_LIBC_SRC_UNISTD_GETPID_H
-#include "hdr/types/pid_t.h"
#include "src/__support/macros/config.h"
+#include <unistd.h>
namespace LIBC_NAMESPACE_DECL {
-pid_t getpid(void);
+pid_t getpid();
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt
index 651ea60..7e733d7 100644
--- a/libc/src/unistd/linux/CMakeLists.txt
+++ b/libc/src/unistd/linux/CMakeLists.txt
@@ -101,7 +101,6 @@ add_entrypoint_object(
libc.include.sys_syscall
libc.src.__support.threads.fork_callbacks
libc.src.__support.OSUtil.osutil
- libc.src.__support.OSUtil.pid
libc.src.__support.threads.thread
libc.src.errno.errno
)
@@ -205,7 +204,8 @@ add_entrypoint_object(
../getpid.h
DEPENDS
libc.include.unistd
- libc.src.__support.OSUtil.pid
+ libc.include.sys_syscall
+ libc.src.__support.OSUtil.osutil
)
add_entrypoint_object(
diff --git a/libc/src/unistd/linux/fork.cpp b/libc/src/unistd/linux/fork.cpp
index 8fe1881..7d47665 100644
--- a/libc/src/unistd/linux/fork.cpp
+++ b/libc/src/unistd/linux/fork.cpp
@@ -8,14 +8,13 @@
#include "src/unistd/fork.h"
-#include "src/__support/OSUtil/pid.h"
#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
#include "src/__support/threads/fork_callbacks.h"
#include "src/__support/threads/thread.h" // For thread self object
-#include "src/errno/libc_errno.h"
+#include "src/errno/libc_errno.h"
#include <signal.h> // For SIGCHLD
#include <sys/syscall.h> // For syscall numbers.
@@ -26,14 +25,6 @@ namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(pid_t, fork, (void)) {
invoke_prepare_callbacks();
-
- // Invalidate tid/pid cache before fork to avoid post fork signal handler from
- // getting wrong values. gettid() is not async-signal-safe, but let's provide
- // our best efforts here.
- pid_t parent_tid = self.get_tid();
- self.invalidate_tid();
- ProcessIdentity::start_fork();
-
#ifdef SYS_fork
pid_t ret = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_fork);
#elif defined(SYS_clone)
@@ -41,6 +32,15 @@ LLVM_LIBC_FUNCTION(pid_t, fork, (void)) {
#else
#error "fork and clone syscalls not available."
#endif
+ if (ret == 0) {
+ // Return value is 0 in the child process.
+ // The child is created with a single thread whose self object will be a
+ // copy of parent process' thread which called fork. So, we have to fix up
+ // the child process' self object with the new process' tid.
+ self.attrib->tid = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid);
+ invoke_child_callbacks();
+ return 0;
+ }
if (ret < 0) {
// Error case, a child process was not created.
@@ -48,18 +48,6 @@ LLVM_LIBC_FUNCTION(pid_t, fork, (void)) {
return -1;
}
- // Child process
- if (ret == 0) {
- self.refresh_tid();
- ProcessIdentity::refresh_cache();
- ProcessIdentity::end_fork();
- invoke_child_callbacks();
- return 0;
- }
-
- // Parent process
- self.refresh_tid(parent_tid);
- ProcessIdentity::end_fork();
invoke_parent_callbacks();
return ret;
}
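
The fork change above drops the process-identity cache and re-reads the child's tid with a raw syscall. A standalone sketch of the staleness problem that avoids (illustration only, Linux):

#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main() {
  pid_t cached_tid = (pid_t)syscall(SYS_gettid); // captured before fork: parent's tid
  pid_t pid = fork();
  if (pid == 0) {
    // In the child the cached value is stale; only a fresh syscall is correct.
    printf("cached=%d actual=%d\n", (int)cached_tid, (int)syscall(SYS_gettid));
    _exit(0);
  }
  waitpid(pid, nullptr, 0);
  return 0;
}
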
diff --git a/libc/src/unistd/linux/getpid.cpp b/libc/src/unistd/linux/getpid.cpp
index 65d6c8a..b24c86a 100644
--- a/libc/src/unistd/linux/getpid.cpp
+++ b/libc/src/unistd/linux/getpid.cpp
@@ -7,10 +7,17 @@
//===----------------------------------------------------------------------===//
#include "src/unistd/getpid.h"
-#include "src/__support/OSUtil/pid.h"
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+
+#include <sys/syscall.h> // For syscall numbers.
+
namespace LIBC_NAMESPACE_DECL {
-LLVM_LIBC_FUNCTION(pid_t, getpid, (void)) { return ProcessIdentity::get(); }
+LLVM_LIBC_FUNCTION(pid_t, getpid, ()) {
+ return LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_getpid);
+}
} // namespace LIBC_NAMESPACE_DECL
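
With the cache removed, getpid is a plain wrapper over the kernel call, so the library result and the raw syscall must agree unconditionally, including in a freshly forked child (quick check, illustration only):

#include <assert.h>
#include <sys/syscall.h>
#include <unistd.h>

int main() {
  assert(getpid() == (pid_t)syscall(SYS_getpid));
  return 0;
}
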
diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt
index 3830bf3..5e57450 100644
--- a/libc/startup/gpu/CMakeLists.txt
+++ b/libc/startup/gpu/CMakeLists.txt
@@ -34,7 +34,7 @@ function(add_startup_object name)
RUNTIME_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR}
RUNTIME_OUTPUT_NAME ${name}.o)
target_link_options(${fq_target_name}.exe PRIVATE
- "-nostdlib" "-flto" "-Wl,--lto-emit-llvm" "-march= ")
+ "-nostdlib" "-flto" "-Wl,--lto-emit-llvm")
endif()
endfunction()
diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt
index 585edf2..336c5d0 100644
--- a/libc/startup/linux/CMakeLists.txt
+++ b/libc/startup/linux/CMakeLists.txt
@@ -101,7 +101,6 @@ add_object_library(
libc.include.llvm-libc-macros.link_macros
libc.src.__support.threads.thread
libc.src.__support.OSUtil.osutil
- libc.src.__support.OSUtil.pid
libc.src.stdlib.exit
libc.src.stdlib.atexit
libc.src.unistd.environ
diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp
index 4047c06..824c0e1 100644
--- a/libc/startup/linux/do_start.cpp
+++ b/libc/startup/linux/do_start.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "startup/linux/do_start.h"
#include "include/llvm-libc-macros/link-macros.h"
-#include "src/__support/OSUtil/pid.h"
#include "src/__support/OSUtil/syscall.h"
#include "src/__support/macros/config.h"
#include "src/__support/threads/thread.h"
@@ -128,10 +127,6 @@ static ThreadAttributes main_thread_attrib;
if (tls.size != 0 && !set_thread_ptr(tls.tp))
syscall_impl<long>(SYS_exit, 1);
- // Validate process identity cache (TLS needed).
- ProcessIdentity::refresh_cache();
- ProcessIdentity::end_fork();
-
self.attrib = &main_thread_attrib;
main_thread_attrib.atexit_callback_mgr =
internal::get_thread_atexit_callback_mgr();
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 6b50f3d..2749908 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -74,7 +74,8 @@ template <typename T> struct FPTest : public Test {
static constexpr T inf = FPBits::inf(Sign::POS).get_val();
static constexpr T neg_inf = FPBits::inf(Sign::NEG).get_val();
static constexpr T min_normal = FPBits::min_normal().get_val();
- static constexpr T max_normal = FPBits::max_normal().get_val();
+ static constexpr T max_normal = FPBits::max_normal(Sign::POS).get_val();
+ static constexpr T neg_max_normal = FPBits::max_normal(Sign::NEG).get_val();
static constexpr T min_denormal = FPBits::min_subnormal().get_val();
static constexpr T max_denormal = FPBits::max_subnormal().get_val();
diff --git a/libc/test/integration/src/pthread/CMakeLists.txt b/libc/test/integration/src/pthread/CMakeLists.txt
index fa5fd3a..eb26822 100644
--- a/libc/test/integration/src/pthread/CMakeLists.txt
+++ b/libc/test/integration/src/pthread/CMakeLists.txt
@@ -32,9 +32,11 @@ add_integration_test(
libc.src.pthread.pthread_rwlock_rdlock
libc.src.pthread.pthread_rwlock_tryrdlock
libc.src.pthread.pthread_rwlock_timedrdlock
+ libc.src.pthread.pthread_rwlock_clockrdlock
libc.src.pthread.pthread_rwlock_wrlock
libc.src.pthread.pthread_rwlock_trywrlock
libc.src.pthread.pthread_rwlock_timedwrlock
+ libc.src.pthread.pthread_rwlock_clockwrlock
libc.src.pthread.pthread_rwlock_unlock
libc.src.pthread.pthread_create
libc.src.pthread.pthread_join
diff --git a/libc/test/integration/src/pthread/pthread_rwlock_test.cpp b/libc/test/integration/src/pthread/pthread_rwlock_test.cpp
index 455003b..9f5fba1 100644
--- a/libc/test/integration/src/pthread/pthread_rwlock_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_rwlock_test.cpp
@@ -15,6 +15,8 @@
#include "src/__support/threads/sleep.h"
#include "src/pthread/pthread_create.h"
#include "src/pthread/pthread_join.h"
+#include "src/pthread/pthread_rwlock_clockrdlock.h"
+#include "src/pthread/pthread_rwlock_clockwrlock.h"
#include "src/pthread/pthread_rwlock_destroy.h"
#include "src/pthread/pthread_rwlock_init.h"
#include "src/pthread/pthread_rwlock_rdlock.h"
@@ -112,6 +114,12 @@ static void nullptr_test() {
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_wrlock(nullptr), EINVAL);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(nullptr, &ts), EINVAL);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(nullptr, &ts), EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockrdlock(nullptr, CLOCK_MONOTONIC, &ts),
+ EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockwrlock(nullptr, CLOCK_MONOTONIC, &ts),
+ EINVAL);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_tryrdlock(nullptr), EINVAL);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_trywrlock(nullptr), EINVAL);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(nullptr), EINVAL);
@@ -159,16 +167,40 @@ static void unusual_timespec_test() {
timespec ts = {0, -1};
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), EINVAL);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ EINVAL);
ts.tv_nsec = 1'000'000'000;
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ EINVAL);
ts.tv_nsec += 1;
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts), EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ EINVAL);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ EINVAL);
ts.tv_nsec = 0;
ts.tv_sec = -1;
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts),
ETIMEDOUT);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts),
ETIMEDOUT);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ ETIMEDOUT);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_MONOTONIC, &ts),
+ ETIMEDOUT);
}
static void timedlock_with_deadlock_test() {
@@ -184,6 +216,13 @@ static void timedlock_with_deadlock_test() {
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedwrlock(&rwlock, &ts),
ETIMEDOUT);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_timedrdlock(&rwlock, &ts), 0);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&rwlock, CLOCK_REALTIME, &ts),
+ ETIMEDOUT);
+ ASSERT_EQ(
+ LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&rwlock, CLOCK_REALTIME, &ts),
+ 0);
+ ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0);
ASSERT_EQ(LIBC_NAMESPACE::pthread_rwlock_unlock(&rwlock), 0);
// notice that ts is already expired, but the following should still succeed.
@@ -270,9 +309,11 @@ enum class Operation : int {
WRITE = 1,
TIMED_READ = 2,
TIMED_WRITE = 3,
- TRY_READ = 4,
- TRY_WRITE = 5,
- COUNT = 6
+ CLOCK_READ = 4,
+ CLOCK_WRITE = 5,
+ TRY_READ = 6,
+ TRY_WRITE = 7,
+ COUNT = 8
};
LIBC_NAMESPACE::RawMutex *io_mutex;
@@ -358,6 +399,24 @@ static void randomized_thread_operation(SharedData *data, ThreadGuard &guard) {
}
break;
}
+ case Operation::CLOCK_READ: {
+ timespec ts = get_ts();
+ if (LIBC_NAMESPACE::pthread_rwlock_clockrdlock(&data->lock, CLOCK_MONOTONIC,
+ &ts) == 0) {
+ read_ops();
+ LIBC_NAMESPACE::pthread_rwlock_unlock(&data->lock);
+ }
+ break;
+ }
+ case Operation::CLOCK_WRITE: {
+ timespec ts = get_ts();
+ if (LIBC_NAMESPACE::pthread_rwlock_clockwrlock(&data->lock, CLOCK_MONOTONIC,
+ &ts) == 0) {
+ write_ops();
+ LIBC_NAMESPACE::pthread_rwlock_unlock(&data->lock);
+ }
+ break;
+ }
case Operation::TRY_READ: {
if (LIBC_NAMESPACE::pthread_rwlock_tryrdlock(&data->lock) == 0) {
read_ops();
diff --git a/libc/test/integration/src/unistd/CMakeLists.txt b/libc/test/integration/src/unistd/CMakeLists.txt
index f50405d0..3f18231 100644
--- a/libc/test/integration/src/unistd/CMakeLists.txt
+++ b/libc/test/integration/src/unistd/CMakeLists.txt
@@ -31,10 +31,6 @@ add_integration_test(
libc.src.sys.wait.wait4
libc.src.sys.wait.waitpid
libc.src.unistd.fork
- libc.src.unistd.getpid
- libc.src.unistd.gettid
- libc.src.stdlib.exit
- libc.include.sys_syscall
)
if((${LIBC_TARGET_OS} STREQUAL "linux") AND (${LIBC_TARGET_ARCHITECTURE_IS_X86}))
diff --git a/libc/test/integration/src/unistd/fork_test.cpp b/libc/test/integration/src/unistd/fork_test.cpp
index 4b82d5f..9c9213e 100644
--- a/libc/test/integration/src/unistd/fork_test.cpp
+++ b/libc/test/integration/src/unistd/fork_test.cpp
@@ -6,21 +6,17 @@
//
//===----------------------------------------------------------------------===//
-#include "src/__support/OSUtil/syscall.h"
#include "src/pthread/pthread_atfork.h"
#include "src/signal/raise.h"
-#include "src/stdlib/exit.h"
#include "src/sys/wait/wait.h"
#include "src/sys/wait/wait4.h"
#include "src/sys/wait/waitpid.h"
#include "src/unistd/fork.h"
-#include "src/unistd/getpid.h"
-#include "src/unistd/gettid.h"
+
#include "test/IntegrationTest/test.h"
#include <errno.h>
#include <signal.h>
-#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
@@ -144,25 +140,7 @@ void fork_with_atfork_callbacks() {
ASSERT_NE(child, DONE);
}
-void fork_pid_tid_test() {
- pid_t pid = fork();
- ASSERT_TRUE(pid >= 0);
- ASSERT_EQ(LIBC_NAMESPACE::gettid(),
- LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid));
- ASSERT_EQ(LIBC_NAMESPACE::getpid(),
- LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_getpid));
-
- if (pid == 0) {
- LIBC_NAMESPACE::exit(0);
- } else {
- int status;
- LIBC_NAMESPACE::waitpid(pid, &status, 0);
- ASSERT_EQ(status, 0);
- }
-}
-
TEST_MAIN(int argc, char **argv, char **envp) {
- fork_pid_tid_test();
fork_and_wait_normal_exit();
fork_and_wait4_normal_exit();
fork_and_waitpid_normal_exit();
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 380d283..cc59559 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -889,6 +889,19 @@ add_fp_unittest(
)
add_fp_unittest(
+ exp_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ exp_test.cpp
+ DEPENDS
+ libc.src.errno.errno
+ libc.src.math.exp
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
expf_test
NEED_MPFR
SUITE
@@ -902,16 +915,14 @@ add_fp_unittest(
)
add_fp_unittest(
- exp_test
- NEED_MPFR
- SUITE
- libc-math-unittests
- SRCS
- exp_test.cpp
- DEPENDS
- libc.src.errno.errno
- libc.src.math.exp
- libc.src.__support.FPUtil.fp_bits
+ expf16_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ expf16_test.cpp
+ DEPENDS
+ libc.src.math.expf16
)
add_fp_unittest(
@@ -2380,6 +2391,35 @@ add_fp_unittest(
libc.src.stdlib.srand
)
+add_fp_unittest(
+ dfmal_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ dfmal_test.cpp
+ HDRS
+ FmaTest.h
+ DEPENDS
+ libc.src.math.dfmal
+ libc.src.stdlib.rand
+ libc.src.stdlib.srand
+)
+
+add_fp_unittest(
+ dsubl_test
+ NEED_MPFR
+ SUITE
+ libc-math-unittests
+ SRCS
+ dsubl_test.cpp
+ HDRS
+ SubTest.h
+ DEPENDS
+ libc.src.math.dsubl
+)
+
+
add_subdirectory(generic)
add_subdirectory(smoke)
diff --git a/libc/test/src/math/CopySignTest.h b/libc/test/src/math/CopySignTest.h
index c66f914..8db4f69 100644
--- a/libc/test/src/math/CopySignTest.h
+++ b/libc/test/src/math/CopySignTest.h
@@ -39,7 +39,7 @@ public:
constexpr StorageType STEP = STORAGE_MAX / COUNT;
for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
double res1 = func(x, -x);
diff --git a/libc/test/src/math/FAbsTest.h b/libc/test/src/math/FAbsTest.h
index 92b589b..7b4ea93 100644
--- a/libc/test/src/math/FAbsTest.h
+++ b/libc/test/src/math/FAbsTest.h
@@ -41,7 +41,7 @@ public:
constexpr StorageType STEP = STORAGE_MAX / COUNT;
for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH(mpfr::Operation::Abs, x, func(x), 0.0);
}
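
The test changes above (and the analogous ones below) replace isnan/isinf on the value with bit-pattern checks on the raw encoding. A freestanding sketch of the same idea for binary32 (illustration only; the real checks live in FPBits from src/__support/FPUtil/FPBits.h):

#include <cstdint>
#include <cstdio>

// binary32: 8 exponent bits, 23 mantissa bits. An all-ones exponent means
// Inf (mantissa zero) or NaN (mantissa nonzero), so both tests can be done on
// the uint32_t encoding without materializing the float value.
static bool bits_is_inf(uint32_t v) { return (v & 0x7fffffffu) == 0x7f800000u; }
static bool bits_is_nan(uint32_t v) { return (v & 0x7fffffffu) > 0x7f800000u; }

int main() {
  std::printf("%d %d\n", bits_is_inf(0x7f800000u), bits_is_nan(0x7fc00000u)); // 1 1
}
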
diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h
index fefcefe..20c63e7 100644
--- a/libc/test/src/math/FDimTest.h
+++ b/libc/test/src/math/FDimTest.h
@@ -67,9 +67,9 @@ public:
for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
++i, v += STEP, w -= STEP) {
T x = FPBits(v).get_val(), y = FPBits(w).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
- if (isnan(y) || isinf(y))
+ if (FPBits(w).is_nan() || FPBits(w).is_inf())
continue;
if (x > y) {
diff --git a/libc/test/src/math/FMaxTest.h b/libc/test/src/math/FMaxTest.h
index 405642c..43904a4 100644
--- a/libc/test/src/math/FMaxTest.h
+++ b/libc/test/src/math/FMaxTest.h
@@ -65,9 +65,9 @@ public:
for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
++i, v += STEP, w -= STEP) {
T x = FPBits(v).get_val(), y = FPBits(w).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
- if (isnan(y) || isinf(y))
+ if (FPBits(w).is_nan() || FPBits(w).is_inf())
continue;
if ((x == 0) && (y == 0))
continue;
diff --git a/libc/test/src/math/FMinTest.h b/libc/test/src/math/FMinTest.h
index eae0008d..51c21ae5 100644
--- a/libc/test/src/math/FMinTest.h
+++ b/libc/test/src/math/FMinTest.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_FMINTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_FMINTEST_H
+#include "src/__support/FPUtil/FPBits.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
@@ -65,9 +66,9 @@ public:
for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT;
++i, v += STEP, w -= STEP) {
T x = FPBits(v).get_val(), y = FPBits(w).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
- if (isnan(y) || isinf(y))
+ if (FPBits(w).is_nan() || FPBits(w).is_inf())
continue;
if ((x == 0) && (y == 0))
continue;
diff --git a/libc/test/src/math/FrexpTest.h b/libc/test/src/math/FrexpTest.h
index 3ba64af..74a2d60 100644
--- a/libc/test/src/math/FrexpTest.h
+++ b/libc/test/src/math/FrexpTest.h
@@ -99,7 +99,7 @@ public:
constexpr StorageType STEP = STORAGE_MAX / COUNT;
for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == 0.0l)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0l)
continue;
mpfr::BinaryOutput<T> result;
diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h
index c2d5a13..ccb92eb 100644
--- a/libc/test/src/math/ILogbTest.h
+++ b/libc/test/src/math/ILogbTest.h
@@ -82,7 +82,7 @@ public:
constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0)
continue;
int exponent;
@@ -101,7 +101,7 @@ public:
constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) {
T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0)
continue;
int exponent;
diff --git a/libc/test/src/math/LogbTest.h b/libc/test/src/math/LogbTest.h
index d6042e3..5ef3b26 100644
--- a/libc/test/src/math/LogbTest.h
+++ b/libc/test/src/math/LogbTest.h
@@ -78,7 +78,7 @@ public:
constexpr StorageType STEP = STORAGE_MAX / COUNT;
for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == 0.0l)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == 0.0l)
continue;
int exponent;
diff --git a/libc/test/src/math/ModfTest.h b/libc/test/src/math/ModfTest.h
index d6c6f27..3377290 100644
--- a/libc/test/src/math/ModfTest.h
+++ b/libc/test/src/math/ModfTest.h
@@ -90,7 +90,7 @@ public:
constexpr StorageType STEP = STORAGE_MAX / COUNT;
for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
T x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x == T(0.0))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x == T(0.0))
continue;
T integral;
diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h
index c39f239..a35a6ee 100644
--- a/libc/test/src/math/RemQuoTest.h
+++ b/libc/test/src/math/RemQuoTest.h
@@ -127,7 +127,7 @@ public:
// In normal range on x86 platforms, the long double implicit 1 bit can be
// zero making the numbers NaN. Hence we test for them separately.
- if (isnan(x) || isnan(y)) {
+ if (FPBits(v).is_nan() || FPBits(w).is_nan()) {
ASSERT_FP_EQ(result.f, nan);
continue;
}
diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp
index 0d25a80..4880582 100644
--- a/libc/test/src/math/acosf_test.cpp
+++ b/libc/test/src/math/acosf_test.cpp
@@ -48,7 +48,7 @@ TEST_F(LlvmLibcAcosfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, x,
LIBC_NAMESPACE::acosf(x), 0.5);
diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp
index 32761e2..5d7f597 100644
--- a/libc/test/src/math/acoshf_test.cpp
+++ b/libc/test/src/math/acoshf_test.cpp
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcAcoshfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acosh, x,
LIBC_NAMESPACE::acoshf(x), 0.5);
diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp
index 91e6108..09dc3c9 100644
--- a/libc/test/src/math/asinf_test.cpp
+++ b/libc/test/src/math/asinf_test.cpp
@@ -46,7 +46,7 @@ TEST_F(LlvmLibcAsinfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, x,
LIBC_NAMESPACE::asinf(x), 0.5);
diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp
index b19e26e..3e55a56 100644
--- a/libc/test/src/math/asinhf_test.cpp
+++ b/libc/test/src/math/asinhf_test.cpp
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcAsinhfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, x,
LIBC_NAMESPACE::asinhf(x), 0.5);
diff --git a/libc/test/src/math/atan2f_test.cpp b/libc/test/src/math/atan2f_test.cpp
index 1242b7e..331f428 100644
--- a/libc/test/src/math/atan2f_test.cpp
+++ b/libc/test/src/math/atan2f_test.cpp
@@ -73,18 +73,18 @@ TEST_F(LlvmLibcAtan2fTest, InFloatRange) {
for (uint32_t i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
for (uint32_t j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
float y = FPBits(w).get_val();
- if (isnan(y) || isinf(y))
+ if (FPBits(w).is_nan() || FPBits(w).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::atan2f(x, y);
++total_count;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++finite_count;
diff --git a/libc/test/src/math/cbrt_test.cpp b/libc/test/src/math/cbrt_test.cpp
index 1233514..2ef2140 100644
--- a/libc/test/src/math/cbrt_test.cpp
+++ b/libc/test/src/math/cbrt_test.cpp
@@ -21,8 +21,8 @@ using LIBC_NAMESPACE::testing::tlog;
TEST_F(LlvmLibcCbrtTest, InDoubleRange) {
constexpr uint64_t COUNT = 123'451;
- uint64_t START = LIBC_NAMESPACE::fputil::FPBits<double>(1.0).uintval();
- uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits<double>(8.0).uintval();
+ uint64_t START = FPBits(1.0).uintval();
+ uint64_t STOP = FPBits(8.0).uintval();
uint64_t STEP = (STOP - START) / COUNT;
auto test = [&](mpfr::RoundingMode rounding_mode) {
@@ -38,12 +38,12 @@ TEST_F(LlvmLibcCbrtTest, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(x).is_inf_or_nan())
continue;
double result = LIBC_NAMESPACE::cbrt(x);
++total;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_inf_or_nan())
continue;
++tested;
diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp
index e12e9a8..484d47f 100644
--- a/libc/test/src/math/cos_test.cpp
+++ b/libc/test/src/math/cos_test.cpp
@@ -81,12 +81,12 @@ TEST_F(LlvmLibcCosTest, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
double result = LIBC_NAMESPACE::cos(x);
++total;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++tested;
diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
index dab35fa..82790e3 100644
--- a/libc/test/src/math/cosf_test.cpp
+++ b/libc/test/src/math/cosf_test.cpp
@@ -47,7 +47,7 @@ TEST_F(LlvmLibcCosfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x,
LIBC_NAMESPACE::cosf(x), 0.5);
diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp
index 7c5d663..00bbf4b 100644
--- a/libc/test/src/math/coshf_test.cpp
+++ b/libc/test/src/math/coshf_test.cpp
@@ -61,7 +61,7 @@ TEST_F(LlvmLibcCoshfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH(mpfr::Operation::Cosh, x, LIBC_NAMESPACE::coshf(x), 0.5);
}
diff --git a/libc/test/src/unistd/gettid_test.cpp b/libc/test/src/math/dfmal_test.cpp
index c2330f4..3c38f5e 100644
--- a/libc/test/src/unistd/gettid_test.cpp
+++ b/libc/test/src/math/dfmal_test.cpp
@@ -1,4 +1,4 @@
-//===-- Unittests for gettid ----------------------------------------------===//
+//===-- Unittests for dfmal -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "src/unistd/gettid.h"
-#include "test/UnitTest/Test.h"
+#include "FmaTest.h"
-TEST(LlvmLibcGetTidTest, SmokeTest) {
- // gettid always succeeds. So, we just call it as a smoke test.
- ASSERT_GT(LIBC_NAMESPACE::gettid(), 0);
-}
+#include "src/math/dfmal.h"
+
+LIST_NARROWING_FMA_TESTS(double, long double, LIBC_NAMESPACE::dfmal)
diff --git a/libc/test/src/math/dsubl_test.cpp b/libc/test/src/math/dsubl_test.cpp
new file mode 100644
index 0000000..98846e0
--- /dev/null
+++ b/libc/test/src/math/dsubl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsubl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/math/dsubl.h"
+
+LIST_SUB_TESTS(double, long double, LIBC_NAMESPACE::dsubl)
diff --git a/libc/test/src/math/erff_test.cpp b/libc/test/src/math/erff_test.cpp
index 5c848d7..851eda4 100644
--- a/libc/test/src/math/erff_test.cpp
+++ b/libc/test/src/math/erff_test.cpp
@@ -64,12 +64,12 @@ TEST_F(LlvmLibcErffTest, InFloatRange) {
for (uint32_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x))
+ if (FPBits(v).is_nan())
continue;
float result = LIBC_NAMESPACE::erff(x);
++cc;
- if (isnan(result))
+ if (FPBits(result).is_nan())
continue;
++count;
diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp
index 4cbdd16..61ae33e 100644
--- a/libc/test/src/math/exp10_test.cpp
+++ b/libc/test/src/math/exp10_test.cpp
@@ -104,12 +104,12 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::exp10(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp
index e9b2786..001b378 100644
--- a/libc/test/src/math/exp10f_test.cpp
+++ b/libc/test/src/math/exp10f_test.cpp
@@ -111,7 +111,7 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::exp10f(x);
@@ -120,7 +120,8 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) {
// in the single-precision floating point range, then ignore comparing with
// MPFR result as MPFR can still produce valid results because of its
// wider precision.
- if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+ if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+ LIBC_NAMESPACE::libc_errno != 0)
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x,
LIBC_NAMESPACE::exp10f(x), 0.5);
diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp
index 73232ed..f218eea 100644
--- a/libc/test/src/math/exp2_test.cpp
+++ b/libc/test/src/math/exp2_test.cpp
@@ -79,12 +79,12 @@ TEST_F(LlvmLibcExp2Test, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::exp2(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
index 8ff0ce6..7caf148 100644
--- a/libc/test/src/math/exp2f_test.cpp
+++ b/libc/test/src/math/exp2f_test.cpp
@@ -107,7 +107,7 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::exp2f(x);
@@ -116,7 +116,8 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) {
// in the single-precision floating point range, then ignore comparing with
// MPFR result as MPFR can still produce valid results because of its
// wider precision.
- if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+ if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+ LIBC_NAMESPACE::libc_errno != 0)
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
LIBC_NAMESPACE::exp2f(x), 0.5);
diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp
index cb94828..793cf0c 100644
--- a/libc/test/src/math/exp2m1f_test.cpp
+++ b/libc/test/src/math/exp2m1f_test.cpp
@@ -49,7 +49,7 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::exp2m1f(x);
@@ -58,7 +58,8 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) {
// in the single-precision floating point range, then ignore comparing with
// MPFR result as MPFR can still produce valid results because of its
// wider precision.
- if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+ if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+ LIBC_NAMESPACE::libc_errno != 0)
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
LIBC_NAMESPACE::exp2m1f(x), 0.5);
diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp
index 64d8198..ee674c5 100644
--- a/libc/test/src/math/exp_test.cpp
+++ b/libc/test/src/math/exp_test.cpp
@@ -77,12 +77,12 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::exp(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/expf16_test.cpp b/libc/test/src/math/expf16_test.cpp
new file mode 100644
index 0000000..ee89a9c
--- /dev/null
+++ b/libc/test/src/math/expf16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for expf16 ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcExpf16Test, PositiveRange) {
+ for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+ float16 x = FPBits(v).get_val();
+ EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
+ LIBC_NAMESPACE::expf16(x), 0.5);
+ }
+}
+
+TEST_F(LlvmLibcExpf16Test, NegativeRange) {
+ for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+ float16 x = FPBits(v).get_val();
+ EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
+ LIBC_NAMESPACE::expf16(x), 0.5);
+ }
+}
diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
index 1dce381..26a0bca 100644
--- a/libc/test/src/math/expf_test.cpp
+++ b/libc/test/src/math/expf_test.cpp
@@ -108,7 +108,7 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::expf(x);
@@ -117,7 +117,8 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) {
// in the single-precision floating point range, then ignore comparing with
// MPFR result as MPFR can still produce valid results because of its
// wider precision.
- if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+ if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+ LIBC_NAMESPACE::libc_errno != 0)
continue;
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
LIBC_NAMESPACE::expf(x), 0.5);
diff --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp
index bcca87f..01197b8 100644
--- a/libc/test/src/math/explogxf_test.cpp
+++ b/libc/test/src/math/explogxf_test.cpp
@@ -17,6 +17,7 @@
#include "utils/MPFRWrapper/MPFRUtils.h"
using LlvmLibcExplogfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+using FPBits = LIBC_NAMESPACE::fputil::FPBits<float>;
namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
@@ -24,7 +25,8 @@ constexpr int def_count = 100003;
constexpr float def_prec = 0.500001f;
auto f_normal = [](float x) -> bool {
- return !(isnan(x) || isinf(x) || LIBC_NAMESPACE::fabs(x) < 2E-38);
+ return !(FPBits(x).is_nan() || FPBits(x).is_inf() ||
+ LIBC_NAMESPACE::fabs(x) < 2E-38);
};
TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
@@ -34,7 +36,7 @@ TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
return static_cast<float>(result.mh * r);
};
auto f_check = [](float x) -> bool {
- return !((isnan(x) || isinf(x) || x < -70 || x > 70 ||
+ return !((FPBits(x).is_nan() || FPBits(x).is_inf() || x < -70 || x > 70 ||
LIBC_NAMESPACE::fabsf(x) < 0x1.0p-10));
};
CHECK_DATA(0.0f, neg_inf, mpfr::Operation::Exp, fx, f_check, def_count,
diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp
index df5c088..9720773 100644
--- a/libc/test/src/math/expm1_test.cpp
+++ b/libc/test/src/math/expm1_test.cpp
@@ -62,12 +62,12 @@ TEST_F(LlvmLibcExpm1Test, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::expm1(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
index 515f988..274fe3b 100644
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@@ -117,7 +117,7 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::expm1f(x);
@@ -126,7 +126,8 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) {
// in the single-precision floating point range, then ignore comparing with
// MPFR result as MPFR can still produce valid results because of its
// wider precision.
- if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+ if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+ LIBC_NAMESPACE::libc_errno != 0)
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
LIBC_NAMESPACE::expm1f(x), 0.5);
diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp
index fd9a615..32b8468 100644
--- a/libc/test/src/math/log10_test.cpp
+++ b/libc/test/src/math/log10_test.cpp
@@ -100,12 +100,12 @@ TEST_F(LlvmLibcLog10Test, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::log10(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp
index 47dfa40..98486de 100644
--- a/libc/test/src/math/log1p_test.cpp
+++ b/libc/test/src/math/log1p_test.cpp
@@ -101,12 +101,12 @@ TEST_F(LlvmLibcLog1pTest, InDoubleRange) {
for (uint64_t i = 0, v = start; i <= COUNT; ++i, v += step) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::log1p(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp
index db0772d..b42cf3b 100644
--- a/libc/test/src/math/log1pf_test.cpp
+++ b/libc/test/src/math/log1pf_test.cpp
@@ -74,7 +74,7 @@ TEST_F(LlvmLibcLog1pfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x,
diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp
index 9992c13..f9bd93d 100644
--- a/libc/test/src/math/log2_test.cpp
+++ b/libc/test/src/math/log2_test.cpp
@@ -99,12 +99,12 @@ TEST_F(LlvmLibcLog2Test, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::log2(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp
index 24b51ad..83691fb 100644
--- a/libc/test/src/math/log2f_test.cpp
+++ b/libc/test/src/math/log2f_test.cpp
@@ -50,7 +50,7 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::log2f(x);
@@ -58,7 +58,8 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) {
// in the single-precision floating point range, then ignore comparing with
// MPFR result as MPFR can still produce valid results because of its
// wider precision.
- if (isnan(result) || isinf(result) || LIBC_NAMESPACE::libc_errno != 0)
+ if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
+ LIBC_NAMESPACE::libc_errno != 0)
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x,
LIBC_NAMESPACE::log2f(x), 0.5);
diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp
index de1e595..c0f9edf 100644
--- a/libc/test/src/math/log_test.cpp
+++ b/libc/test/src/math/log_test.cpp
@@ -98,12 +98,12 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::log(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp
index 28a171d..79d8275 100644
--- a/libc/test/src/math/logf_test.cpp
+++ b/libc/test/src/math/logf_test.cpp
@@ -82,7 +82,7 @@ TEST_F(LlvmLibcLogfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x,
LIBC_NAMESPACE::logf(x), 0.5);
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index 536338b..b43d21a 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -176,6 +176,17 @@ add_perf_binary(
)
add_perf_binary(
+ expf16_perf
+ SRCS
+ expf16_perf.cpp
+ DEPENDS
+ .single_input_single_output_diff
+ libc.src.math.expf16
+ COMPILE_OPTIONS
+ -fno-builtin
+)
+
+add_perf_binary(
fabsf_perf
SRCS
fabsf_perf.cpp
diff --git a/libc/test/src/math/performance_testing/expf16_perf.cpp b/libc/test/src/math/performance_testing/expf16_perf.cpp
new file mode 100644
index 0000000..bc9d9f0
--- /dev/null
+++ b/libc/test/src/math/performance_testing/expf16_perf.cpp
@@ -0,0 +1,22 @@
+//===-- Performance test for expf16 ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SingleInputSingleOutputPerf.h"
+
+#include "src/math/expf16.h"
+
+// LLVM libc might currently be the only libc implementation that supports
+// float16 math functions, so we can't compare them against the system libc.
+// Instead, we compare them against this placeholder function.
+static float16 placeholderf16(float16 x) { return x; }
+
+int main() {
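+  // Times LIBC_NAMESPACE::expf16 against the placeholder for 20'000 rounds and
+  // writes the measurements to expf16_perf.log.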
+ SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::expf16,
+ ::placeholderf16, 20'000,
+ "expf16_perf.log")
+}
diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
index 797913e..c13231f 100644
--- a/libc/test/src/math/powf_test.cpp
+++ b/libc/test/src/math/powf_test.cpp
@@ -71,18 +71,18 @@ TEST_F(LlvmLibcPowfTest, InFloatRange) {
for (uint32_t i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x) || x < 0.0)
+ if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
continue;
for (uint32_t j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
float y = FPBits(w).get_val();
- if (isnan(y) || isinf(y))
+ if (FPBits(w).is_nan() || FPBits(w).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
float result = LIBC_NAMESPACE::powf(x, y);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp
index 89534ae..60f6ef5 100644
--- a/libc/test/src/math/sin_test.cpp
+++ b/libc/test/src/math/sin_test.cpp
@@ -67,12 +67,12 @@ TEST_F(LlvmLibcSinTest, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
LIBC_NAMESPACE::libc_errno = 0;
double result = LIBC_NAMESPACE::sin(x);
++cc;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++count;
diff --git a/libc/test/src/math/sincos_test.cpp b/libc/test/src/math/sincos_test.cpp
index 7e06456..09c8715 100644
--- a/libc/test/src/math/sincos_test.cpp
+++ b/libc/test/src/math/sincos_test.cpp
@@ -110,7 +110,7 @@ TEST_F(LlvmLibcSincosTest, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_SINCOS_MATCH_ALL_ROUNDING(x);
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index 7c359b3..7254c3b 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -101,7 +101,7 @@ TEST_F(LlvmLibcSinCosfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
EXPECT_SINCOS_MATCH_ALL_ROUNDING(x);
diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
index 6a8f8f4e..3703626 100644
--- a/libc/test/src/math/sinf_test.cpp
+++ b/libc/test/src/math/sinf_test.cpp
@@ -48,7 +48,7 @@ TEST_F(LlvmLibcSinfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x,
LIBC_NAMESPACE::sinf(x), 0.5);
diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp
index cc0552f..400df2f 100644
--- a/libc/test/src/math/sinhf_test.cpp
+++ b/libc/test/src/math/sinhf_test.cpp
@@ -46,7 +46,7 @@ TEST_F(LlvmLibcSinhfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH(mpfr::Operation::Sinh, x, LIBC_NAMESPACE::sinhf(x), 0.5);
}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 8b29423..faca71b 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -359,6 +359,58 @@ add_fp_unittest(
)
add_fp_unittest(
+ dfmal_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ dfmal_test.cpp
+ HDRS
+ FmaTest.h
+ DEPENDS
+ libc.src.math.dfmal
+)
+
+add_fp_unittest(
+ dfmaf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ dfmaf128_test.cpp
+ HDRS
+ FmaTest.h
+ DEPENDS
+ libc.src.math.dfmaf128
+)
+
+add_fp_unittest(
+ dsubl_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ dsubl_test.cpp
+ HDRS
+ SubTest.h
+ DEPENDS
+ libc.src.errno.errno
+ libc.hdr.fenv_macros
+ libc.src.math.dsubl
+)
+
+add_fp_unittest(
+ dsubf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ dsubf128_test.cpp
+ HDRS
+ SubTest.h
+ DEPENDS
+ libc.hdr.errno_macros
+ libc.hdr.fenv_macros
+ libc.src.math.dsubf128
+)
+
+add_fp_unittest(
floor_test
SUITE
libc-math-smoke-tests
@@ -941,6 +993,18 @@ add_fp_unittest(
)
add_fp_unittest(
+ exp_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ exp_test.cpp
+ DEPENDS
+ libc.src.errno.errno
+ libc.src.math.exp
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_fp_unittest(
expf_test
SUITE
libc-math-smoke-tests
@@ -953,15 +1017,16 @@ add_fp_unittest(
)
add_fp_unittest(
- exp_test
- SUITE
- libc-math-smoke-tests
- SRCS
- exp_test.cpp
- DEPENDS
- libc.src.errno.errno
- libc.src.math.exp
- libc.src.__support.FPUtil.fp_bits
+ expf16_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ expf16_test.cpp
+ DEPENDS
+ libc.hdr.errno_macros
+ libc.hdr.fenv_macros
+ libc.src.errno.errno
+ libc.src.math.expf16
)
add_fp_unittest(
@@ -3642,6 +3707,30 @@ add_fp_unittest(
)
add_fp_unittest(
+ totalorder_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ totalorder_test.cpp
+ HDRS
+ TotalOrderTest.h
+ DEPENDS
+ libc.src.math.totalorder
+)
+
+add_fp_unittest(
+ totalorderf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ totalorderf_test.cpp
+ HDRS
+ TotalOrderTest.h
+ DEPENDS
+ libc.src.math.totalorderf
+)
+
+add_fp_unittest(
totalorderf16_test
SUITE
libc-math-smoke-tests
@@ -3654,6 +3743,18 @@ add_fp_unittest(
)
add_fp_unittest(
+ totalorderf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ totalorderf128_test.cpp
+ HDRS
+ TotalOrderTest.h
+ DEPENDS
+ libc.src.math.totalorderf128
+)
+
+add_fp_unittest(
totalordermag_test
SUITE
libc-math-smoke-tests
@@ -3714,6 +3815,30 @@ add_fp_unittest(
)
add_fp_unittest(
+ getpayload_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ getpayload_test.cpp
+ HDRS
+ GetPayloadTest.h
+ DEPENDS
+ libc.src.math.getpayload
+)
+
+add_fp_unittest(
+ getpayloadf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ getpayloadf_test.cpp
+ HDRS
+ GetPayloadTest.h
+ DEPENDS
+ libc.src.math.getpayloadf
+)
+
+add_fp_unittest(
getpayloadf16_test
SUITE
libc-math-smoke-tests
@@ -3726,6 +3851,42 @@ add_fp_unittest(
)
add_fp_unittest(
+ getpayloadf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ getpayloadf128_test.cpp
+ HDRS
+ GetPayloadTest.h
+ DEPENDS
+ libc.src.math.getpayloadf128
+)
+
+add_fp_unittest(
+ setpayload_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ setpayload_test.cpp
+ HDRS
+ SetPayloadTest.h
+ DEPENDS
+ libc.src.math.setpayload
+)
+
+add_fp_unittest(
+ setpayloadf_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ setpayloadf_test.cpp
+ HDRS
+ SetPayloadTest.h
+ DEPENDS
+ libc.src.math.setpayloadf
+)
+
+add_fp_unittest(
setpayloadf16_test
SUITE
libc-math-smoke-tests
@@ -3738,6 +3899,18 @@ add_fp_unittest(
)
add_fp_unittest(
+ setpayloadf128_test
+ SUITE
+ libc-math-smoke-tests
+ SRCS
+ setpayloadf128_test.cpp
+ HDRS
+ SetPayloadTest.h
+ DEPENDS
+ libc.src.math.setpayloadf128
+)
+
+add_fp_unittest(
setpayloadsigf16_test
SUITE
libc-math-smoke-tests
@@ -3818,8 +3991,8 @@ add_fp_unittest(
HDRS
SubTest.h
DEPENDS
+ libc.hdr.errno_macros
libc.hdr.fenv_macros
- libc.src.__support.FPUtil.basic_operations
libc.src.math.f16sub
)
@@ -3832,8 +4005,8 @@ add_fp_unittest(
HDRS
SubTest.h
DEPENDS
+ libc.hdr.errno_macros
libc.hdr.fenv_macros
- libc.src.__support.FPUtil.basic_operations
libc.src.math.f16subf
)
@@ -3846,8 +4019,8 @@ add_fp_unittest(
HDRS
SubTest.h
DEPENDS
+ libc.hdr.errno_macros
libc.hdr.fenv_macros
- libc.src.__support.FPUtil.basic_operations
libc.src.math.f16subl
)
@@ -3860,8 +4033,8 @@ add_fp_unittest(
HDRS
SubTest.h
DEPENDS
+ libc.hdr.errno_macros
libc.hdr.fenv_macros
- libc.src.__support.FPUtil.basic_operations
libc.src.math.f16subf128
)
diff --git a/libc/test/src/math/smoke/SubTest.h b/libc/test/src/math/smoke/SubTest.h
index e5e0499..9ee4220b 100644
--- a/libc/test/src/math/smoke/SubTest.h
+++ b/libc/test/src/math/smoke/SubTest.h
@@ -9,8 +9,8 @@
#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_SUBTEST_H
#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_SUBTEST_H
+#include "hdr/errno_macros.h"
#include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/BasicOperations.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/dfmaf128_test.cpp b/libc/test/src/math/smoke/dfmaf128_test.cpp
new file mode 100644
index 0000000..56c1174
--- /dev/null
+++ b/libc/test/src/math/smoke/dfmaf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dfmaf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/dfmaf128.h"
+
+LIST_NARROWING_FMA_TESTS(double, float128, LIBC_NAMESPACE::dfmaf128)
diff --git a/libc/test/src/math/smoke/dfmal_test.cpp b/libc/test/src/math/smoke/dfmal_test.cpp
new file mode 100644
index 0000000..3c38f5e
--- /dev/null
+++ b/libc/test/src/math/smoke/dfmal_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dfmal -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/dfmal.h"
+
+LIST_NARROWING_FMA_TESTS(double, long double, LIBC_NAMESPACE::dfmal)
diff --git a/libc/test/src/math/smoke/dsubf128_test.cpp b/libc/test/src/math/smoke/dsubf128_test.cpp
new file mode 100644
index 0000000..e496cdd
--- /dev/null
+++ b/libc/test/src/math/smoke/dsubf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsubf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/math/dsubf128.h"
+
+LIST_SUB_TESTS(double, float128, LIBC_NAMESPACE::dsubf128)
diff --git a/libc/test/src/math/smoke/dsubl_test.cpp b/libc/test/src/math/smoke/dsubl_test.cpp
new file mode 100644
index 0000000..98846e0
--- /dev/null
+++ b/libc/test/src/math/smoke/dsubl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for dsubl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SubTest.h"
+
+#include "src/math/dsubl.h"
+
+LIST_SUB_TESTS(double, long double, LIBC_NAMESPACE::dsubl)
diff --git a/libc/test/src/math/smoke/expf16_test.cpp b/libc/test/src/math/smoke/expf16_test.cpp
new file mode 100644
index 0000000..969870f
--- /dev/null
+++ b/libc/test/src/math/smoke/expf16_test.cpp
@@ -0,0 +1,66 @@
+//===-- Unittests for expf16 ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/expf16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcExpf16Test, SpecialNumbers) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expf16(aNaN));
+ EXPECT_MATH_ERRNO(0);
+
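+  // A signaling NaN is quieted to a NaN and raises FE_INVALID, without setting
+  // errno.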
+ EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expf16(sNaN), FE_INVALID);
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::expf16(inf));
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(zero),
+ LIBC_NAMESPACE::expf16(neg_inf));
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f),
+ LIBC_NAMESPACE::expf16(zero));
+ EXPECT_MATH_ERRNO(0);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(1.0f),
+ LIBC_NAMESPACE::expf16(neg_zero));
+ EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExpf16Test, Overflow) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expf16(max_normal),
+ FE_OVERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+
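+  // exp(12) is roughly 1.6e5, above the float16 maximum of 65504, so it
+  // overflows.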
+ EXPECT_FP_EQ_WITH_EXCEPTION(
+ inf, LIBC_NAMESPACE::expf16(static_cast<float16>(12.0)), FE_OVERFLOW);
+ EXPECT_MATH_ERRNO(ERANGE);
+}
+
+TEST_F(LlvmLibcExpf16Test, Underflow) {
+ LIBC_NAMESPACE::libc_errno = 0;
+
+ EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::expf16(neg_max_normal),
+ FE_UNDERFLOW | FE_INEXACT);
+ EXPECT_MATH_ERRNO(ERANGE);
+
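+  // exp(-18) is roughly 1.5e-8, below the smallest positive float16 subnormal
+  // (2^-24, about 6.0e-8), so it underflows to zero.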
+ EXPECT_FP_EQ_WITH_EXCEPTION(
+ zero, LIBC_NAMESPACE::expf16(static_cast<float16>(-18.0)),
+ FE_UNDERFLOW | FE_INEXACT);
+ EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libc/test/src/math/smoke/getpayload_test.cpp b/libc/test/src/math/smoke/getpayload_test.cpp
new file mode 100644
index 0000000..f157d45
--- /dev/null
+++ b/libc/test/src/math/smoke/getpayload_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for getpayload ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GetPayloadTest.h"
+
+#include "src/math/getpayload.h"
+
+LIST_GETPAYLOAD_TESTS(double, LIBC_NAMESPACE::getpayload)
diff --git a/libc/test/src/math/smoke/getpayloadf128_test.cpp b/libc/test/src/math/smoke/getpayloadf128_test.cpp
new file mode 100644
index 0000000..37bb506
--- /dev/null
+++ b/libc/test/src/math/smoke/getpayloadf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for getpayloadf128 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GetPayloadTest.h"
+
+#include "src/math/getpayloadf128.h"
+
+LIST_GETPAYLOAD_TESTS(float128, LIBC_NAMESPACE::getpayloadf128)
diff --git a/libc/test/src/math/smoke/getpayloadf_test.cpp b/libc/test/src/math/smoke/getpayloadf_test.cpp
new file mode 100644
index 0000000..89ed024
--- /dev/null
+++ b/libc/test/src/math/smoke/getpayloadf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for getpayloadf -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GetPayloadTest.h"
+
+#include "src/math/getpayloadf.h"
+
+LIST_GETPAYLOAD_TESTS(float, LIBC_NAMESPACE::getpayloadf)
diff --git a/libc/test/src/math/smoke/setpayload_test.cpp b/libc/test/src/math/smoke/setpayload_test.cpp
new file mode 100644
index 0000000..e41b3f8
--- /dev/null
+++ b/libc/test/src/math/smoke/setpayload_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for setpayload ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetPayloadTest.h"
+
+#include "src/math/setpayload.h"
+
+LIST_SETPAYLOAD_TESTS(double, LIBC_NAMESPACE::setpayload)
diff --git a/libc/test/src/math/smoke/setpayloadf128_test.cpp b/libc/test/src/math/smoke/setpayloadf128_test.cpp
new file mode 100644
index 0000000..4b17bfe
--- /dev/null
+++ b/libc/test/src/math/smoke/setpayloadf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for setpayloadf128 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetPayloadTest.h"
+
+#include "src/math/setpayloadf128.h"
+
+LIST_SETPAYLOAD_TESTS(float128, LIBC_NAMESPACE::setpayloadf128)
diff --git a/libc/test/src/math/smoke/setpayloadf_test.cpp b/libc/test/src/math/smoke/setpayloadf_test.cpp
new file mode 100644
index 0000000..51e285f
--- /dev/null
+++ b/libc/test/src/math/smoke/setpayloadf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for setpayloadf -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetPayloadTest.h"
+
+#include "src/math/setpayloadf.h"
+
+LIST_SETPAYLOAD_TESTS(float, LIBC_NAMESPACE::setpayloadf)
diff --git a/libc/test/src/math/smoke/totalorder_test.cpp b/libc/test/src/math/smoke/totalorder_test.cpp
new file mode 100644
index 0000000..21f49c3
--- /dev/null
+++ b/libc/test/src/math/smoke/totalorder_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalorder ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderTest.h"
+
+#include "src/math/totalorder.h"
+
+LIST_TOTALORDER_TESTS(double, LIBC_NAMESPACE::totalorder)
diff --git a/libc/test/src/math/smoke/totalorderf128_test.cpp b/libc/test/src/math/smoke/totalorderf128_test.cpp
new file mode 100644
index 0000000..6f7fd6a
--- /dev/null
+++ b/libc/test/src/math/smoke/totalorderf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalorderf128 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderTest.h"
+
+#include "src/math/totalorderf128.h"
+
+LIST_TOTALORDER_TESTS(float128, LIBC_NAMESPACE::totalorderf128)
diff --git a/libc/test/src/math/smoke/totalorderf_test.cpp b/libc/test/src/math/smoke/totalorderf_test.cpp
new file mode 100644
index 0000000..71db87c
--- /dev/null
+++ b/libc/test/src/math/smoke/totalorderf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for totalorderf -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TotalOrderTest.h"
+
+#include "src/math/totalorderf.h"
+
+LIST_TOTALORDER_TESTS(float, LIBC_NAMESPACE::totalorderf)
diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp
index 80d5793..1ca67af 100644
--- a/libc/test/src/math/tan_test.cpp
+++ b/libc/test/src/math/tan_test.cpp
@@ -75,12 +75,12 @@ TEST_F(LlvmLibcTanTest, InDoubleRange) {
for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
double x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
double result = LIBC_NAMESPACE::tan(x);
++total;
- if (isnan(result) || isinf(result))
+ if (FPBits(result).is_nan() || FPBits(result).is_inf())
continue;
++tested;
diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp
index e624d30..9b9e127 100644
--- a/libc/test/src/math/tanf_test.cpp
+++ b/libc/test/src/math/tanf_test.cpp
@@ -48,7 +48,7 @@ TEST_F(LlvmLibcTanfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x,
LIBC_NAMESPACE::tanf(x), 0.5);
diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index c34efe8..2e74984 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcTanhfTest, InFloatRange) {
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = FPBits(v).get_val();
- if (isnan(x) || isinf(x))
+ if (FPBits(v).is_nan() || FPBits(v).is_inf())
continue;
ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
LIBC_NAMESPACE::tanhf(x), 0.5);
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 10ec890..4ac83ec 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -283,6 +283,20 @@ add_libc_test(
)
add_libc_test(
+ vsscanf_test
+ SUITE
+ libc_stdio_unittests
+ SRCS
+ vsscanf_test.cpp
+ DEPENDS
+ libc.src.stdio.vsscanf
+ LINK_LIBRARIES
+ LibcFPTestHelpers
+ COMPILE_OPTIONS
+ ${sscanf_test_copts}
+)
+
+add_libc_test(
puts_test
HERMETIC_TEST_ONLY # writes to libc's stdout
SUITE
diff --git a/libc/test/src/stdio/vsscanf_test.cpp b/libc/test/src/stdio/vsscanf_test.cpp
new file mode 100644
index 0000000..4194e10
--- /dev/null
+++ b/libc/test/src/stdio/vsscanf_test.cpp
@@ -0,0 +1,159 @@
+//===-- Unittests for vsscanf ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/vsscanf.h"
+
+#include "test/UnitTest/Test.h"
+
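+// Builds a va_list from the variadic arguments and forwards it to vsscanf,
+// mirroring what sscanf does internally.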
+int call_vsscanf(const char *__restrict buffer, const char *__restrict format,
+ ...) {
+ va_list vlist;
+ va_start(vlist, format);
+ int ret = LIBC_NAMESPACE::vsscanf(buffer, format, vlist);
+ va_end(vlist);
+ return ret;
+}
+
+TEST(LlvmLibcVSScanfTest, SimpleStringConv) {
+ int ret_val;
+ char buffer[10];
+ char buffer2[10];
+ ret_val = call_vsscanf("abc123", "abc %s", buffer);
+ ASSERT_EQ(ret_val, 1);
+ ASSERT_STREQ(buffer, "123");
+
+ ret_val = call_vsscanf("abc123", "%3s %3s", buffer, buffer2);
+ ASSERT_EQ(ret_val, 2);
+ ASSERT_STREQ(buffer, "abc");
+ ASSERT_STREQ(buffer2, "123");
+
+ ret_val = call_vsscanf("abc 123", "%3s%3s", buffer, buffer2);
+ ASSERT_EQ(ret_val, 2);
+ ASSERT_STREQ(buffer, "abc");
+ ASSERT_STREQ(buffer2, "123");
+}
+
+TEST(LlvmLibcVSScanfTest, IntConvSimple) {
+ int ret_val;
+ int result = 0;
+ ret_val = call_vsscanf("123", "%d", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 123);
+
+ ret_val = call_vsscanf("456", "%i", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 456);
+
+ ret_val = call_vsscanf("789", "%x", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 0x789);
+
+ ret_val = call_vsscanf("012", "%o", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 012);
+
+ ret_val = call_vsscanf("345", "%u", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 345);
+
+ // A 288-character number, far larger than INT_MAX; the conversion saturates
+ // at the maximum intmax_t value, which is then cast down to int.
+ ret_val = call_vsscanf("10000000000000000000000000000000"
+ "00000000000000000000000000000000"
+ "00000000000000000000000000000000"
+ "00000000000000000000000000000000"
+ "00000000000000000000000000000000"
+ "00000000000000000000000000000000"
+ "00000000000000000000000000000000"
+ "00000000000000000000000000000000"
+ "00000000000000000000000000000000",
+ "%d", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, int(LIBC_NAMESPACE::cpp::numeric_limits<intmax_t>::max()));
+
+ ret_val = call_vsscanf("Not an integer", "%d", &result);
+ EXPECT_EQ(ret_val, 0);
+}
+
+TEST(LlvmLibcVSScanfTest, IntConvLengthModifier) {
+ int ret_val;
+ uintmax_t max_result = 0;
+ int int_result = 0;
+ char char_result = 0;
+
+ ret_val = call_vsscanf("123", "%ju", &max_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(max_result, uintmax_t(123));
+
+ // Check overflow handling
+ ret_val =
+ call_vsscanf("999999999999999999999999999999999999", "%ju", &max_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(max_result, LIBC_NAMESPACE::cpp::numeric_limits<uintmax_t>::max());
+
+ // Because this is unsigned, any out of range value should return the maximum,
+ // even with a negative sign.
+ ret_val =
+ call_vsscanf("-999999999999999999999999999999999999", "%ju", &max_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(max_result, LIBC_NAMESPACE::cpp::numeric_limits<uintmax_t>::max());
+
+ ret_val = call_vsscanf("-18446744073709551616", "%ju", &max_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(max_result, LIBC_NAMESPACE::cpp::numeric_limits<uintmax_t>::max());
+
+ // But any number below the maximum should have the - sign applied.
+ ret_val = call_vsscanf("-1", "%ju", &max_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(max_result, uintmax_t(-1));
+
+ ret_val = call_vsscanf("-1", "%u", &int_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(int_result, -1);
+
+ max_result = 0xff00ff00ff00ff00;
+ char_result = 0x6f;
+
+ // Overflows for sizes larger than the maximum are handled by casting.
+ ret_val = call_vsscanf("8589967360", "%d", &int_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(int_result, int(8589967360)); // 2^33 + 2^15
+
+ // Check that the adjacent values weren't touched by the overflow.
+ ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00));
+ ASSERT_EQ(char_result, char(0x6f));
+
+ ret_val = call_vsscanf("-8589967360", "%d", &int_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(int_result, int(-8589967360));
+ ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00));
+ ASSERT_EQ(char_result, char(0x6f));
+
+ ret_val = call_vsscanf("25", "%hhd", &char_result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(char_result, char(25));
+}
+
+TEST(LlvmLibcVSScanfTest, IntConvBaseSelection) {
+ int ret_val;
+ int result = 0;
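+ // %i auto-detects the base: a "0x" prefix selects hex, a leading 0 selects
+ // octal, and anything else is decimal.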
+ ret_val = call_vsscanf("0xabc123", "%i", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 0xabc123);
+
+ ret_val = call_vsscanf("0456", "%i", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 0456);
+
+ ret_val = call_vsscanf("0999", "%i", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 0);
+
+ ret_val = call_vsscanf("123abc456", "%i", &result);
+ EXPECT_EQ(ret_val, 1);
+ EXPECT_EQ(result, 123);
+}
diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt
index f829265..332455b 100644
--- a/libc/test/src/unistd/CMakeLists.txt
+++ b/libc/test/src/unistd/CMakeLists.txt
@@ -379,16 +379,6 @@ add_libc_unittest(
)
add_libc_unittest(
- gettid_test
- SUITE
- libc_unistd_unittests
- SRCS
- gettid_test.cpp
- DEPENDS
- libc.src.unistd.gettid
-)
-
-add_libc_unittest(
getppid_test
SUITE
libc_unistd_unittests
diff --git a/libc/test/utils/FPUtil/x86_long_double_test.cpp b/libc/test/utils/FPUtil/x86_long_double_test.cpp
index 87796b5..8d16869 100644
--- a/libc/test/utils/FPUtil/x86_long_double_test.cpp
+++ b/libc/test/utils/FPUtil/x86_long_double_test.cpp
@@ -27,8 +27,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
// If exponent has the max value and the implicit bit is 0,
// then the number is a NaN for all values of mantissa.
bits.set_mantissa(i);
- long double nan = bits.get_val();
- ASSERT_NE(static_cast<int>(isnan(nan)), 0);
ASSERT_TRUE(bits.is_nan());
}
@@ -38,8 +36,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
// then the number is a NaN for all non-zero values of mantissa.
// Note the initial value of |i| of 1 to avoid a zero mantissa.
bits.set_mantissa(i);
- long double nan = bits.get_val();
- ASSERT_NE(static_cast<int>(isnan(nan)), 0);
ASSERT_TRUE(bits.is_nan());
}
@@ -49,8 +45,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
// If exponent is non-zero and also not max, and the implicit bit is 0,
// then the number is a NaN for all values of mantissa.
bits.set_mantissa(i);
- long double nan = bits.get_val();
- ASSERT_NE(static_cast<int>(isnan(nan)), 0);
ASSERT_TRUE(bits.is_nan());
}
@@ -60,8 +54,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
// If exponent is non-zero and also not max, and the implicit bit is 1,
// then the number is normal value for all values of mantissa.
bits.set_mantissa(i);
- long double valid = bits.get_val();
- ASSERT_EQ(static_cast<int>(isnan(valid)), 0);
ASSERT_FALSE(bits.is_nan());
}
@@ -70,8 +62,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
for (unsigned int i = 0; i < COUNT; ++i) {
// If exponent is zero, then the number is a valid but denormal value.
bits.set_mantissa(i);
- long double valid = bits.get_val();
- ASSERT_EQ(static_cast<int>(isnan(valid)), 0);
ASSERT_FALSE(bits.is_nan());
}
@@ -80,8 +70,6 @@ TEST(LlvmLibcX86LongDoubleTest, is_nan) {
for (unsigned int i = 0; i < COUNT; ++i) {
// If exponent is zero, then the number is a valid but denormal value.
bits.set_mantissa(i);
- long double valid = bits.get_val();
- ASSERT_EQ(static_cast<int>(isnan(valid)), 0);
ASSERT_FALSE(bits.is_nan());
}
}
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index f88ee2a..4263c9d 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -1086,6 +1086,9 @@ template void
explain_ternary_operation_one_output_error(Operation,
const TernaryInput<long double> &,
long double, double, RoundingMode);
+
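+// Instantiation for ternary operations taking long double inputs and producing
+// a double result (e.g. the narrowing dfmal tests).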
+template void explain_ternary_operation_one_output_error(
+ Operation, const TernaryInput<long double> &, double, double, RoundingMode);
#ifdef LIBC_TYPES_HAS_FLOAT16
template void explain_ternary_operation_one_output_error(
Operation, const TernaryInput<float> &, float16, double, RoundingMode);
@@ -1271,6 +1274,9 @@ template bool
compare_ternary_operation_one_output(Operation,
const TernaryInput<long double> &,
long double, double, RoundingMode);
+
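+// Matching comparison instantiation for long double inputs with a double
+// result.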
+template bool compare_ternary_operation_one_output(
+ Operation, const TernaryInput<long double> &, double, double, RoundingMode);
#ifdef LIBC_TYPES_HAS_FLOAT16
template bool compare_ternary_operation_one_output(Operation,
const TernaryInput<float> &,
diff --git a/libcxx/.clang-format b/libcxx/.clang-format
index 871920f..b2ca452 100644
--- a/libcxx/.clang-format
+++ b/libcxx/.clang-format
@@ -24,7 +24,6 @@ AttributeMacros: [
'_LIBCPP_CONSTEXPR_SINCE_CXX23',
'_LIBCPP_CONSTEXPR',
'_LIBCPP_CONSTINIT',
- '_LIBCPP_DEPRECATED_ATOMIC_SYNC',
'_LIBCPP_DEPRECATED_IN_CXX11',
'_LIBCPP_DEPRECATED_IN_CXX14',
'_LIBCPP_DEPRECATED_IN_CXX17',
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 917c6be..6168c76 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -310,10 +310,6 @@ endif()
option(LIBCXX_ENABLE_PEDANTIC "Compile with pedantic enabled." OFF)
option(LIBCXX_ENABLE_WERROR "Fail and stop if a warning is triggered." OFF)
-option(LIBCXX_GENERATE_COVERAGE "Enable generating code coverage." OFF)
-set(LIBCXX_COVERAGE_LIBRARY "" CACHE STRING
- "The Profile-rt library used to build with code coverage")
-
set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT OFF)
if (WIN32)
set(LIBCXX_HERMETIC_STATIC_LIBRARY_DEFAULT ON)
@@ -376,12 +372,6 @@ if (NOT LIBCXX_ENABLE_RTTI AND LIBCXX_ENABLE_EXCEPTIONS)
" for details.")
endif()
-# Ensure LLVM_USE_SANITIZER is not specified when LIBCXX_GENERATE_COVERAGE
-# is ON.
-if (LLVM_USE_SANITIZER AND LIBCXX_GENERATE_COVERAGE)
- message(FATAL_ERROR "LLVM_USE_SANITIZER cannot be used with LIBCXX_GENERATE_COVERAGE")
-endif()
-
if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT)
if (APPLE)
message(FATAL_ERROR "LIBCXX_ENABLE_ABI_LINKER_SCRIPT cannot be used on APPLE targets")
@@ -490,12 +480,6 @@ endif()
# Configure compiler.
include(config-ix)
-# Configure coverage options.
-if (LIBCXX_GENERATE_COVERAGE)
- include(CodeCoverage)
- set(CMAKE_BUILD_TYPE "COVERAGE" CACHE STRING "" FORCE)
-endif()
-
#===============================================================================
# Setup Compiler Flags
#===============================================================================
@@ -861,10 +845,6 @@ add_subdirectory(src)
add_subdirectory(utils)
add_subdirectory(modules)
-if (LIBCXX_INCLUDE_BENCHMARKS)
- add_subdirectory(benchmarks)
-endif()
-
if (LIBCXX_INCLUDE_TESTS)
add_subdirectory(test)
add_subdirectory(lib/abi)
diff --git a/libcxx/cmake/Modules/CodeCoverage.cmake b/libcxx/cmake/Modules/CodeCoverage.cmake
deleted file mode 100644
index 1bd3a78..0000000
--- a/libcxx/cmake/Modules/CodeCoverage.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-find_program(CODE_COVERAGE_LCOV lcov)
-if (NOT CODE_COVERAGE_LCOV)
- message(FATAL_ERROR "Cannot find lcov...")
-endif()
-
-find_program(CODE_COVERAGE_LLVM_COV llvm-cov)
-if (NOT CODE_COVERAGE_LLVM_COV)
- message(FATAL_ERROR "Cannot find llvm-cov...")
-endif()
-
-find_program(CODE_COVERAGE_GENHTML genhtml)
-if (NOT CODE_COVERAGE_GENHTML)
- message(FATAL_ERROR "Cannot find genhtml...")
-endif()
-
-set(CMAKE_CXX_FLAGS_COVERAGE "-g -O0 --coverage")
-
-function(setup_lcov_test_target_coverage target_name output_dir capture_dirs source_dirs)
- if (NOT DEFINED LIBCXX_BINARY_DIR)
- message(FATAL_ERROR "Variable must be set")
- endif()
-
- set(GCOV_TOOL "${LIBCXX_BINARY_DIR}/llvm-cov-wrapper")
- file(GENERATE OUTPUT ${GCOV_TOOL}
- CONTENT "#!/usr/bin/env bash\n${CODE_COVERAGE_LLVM_COV} gcov \"$@\"\n")
-
- file(MAKE_DIRECTORY ${output_dir})
-
- set(CAPTURE_DIRS "")
- foreach(cdir ${capture_dirs})
- list(APPEND CAPTURE_DIRS "-d;${cdir}")
- endforeach()
-
- set(EXTRACT_DIRS "")
- foreach(sdir ${source_dirs})
- list(APPEND EXTRACT_DIRS "'${sdir}/*'")
- endforeach()
-
- message(STATUS "Capture Directories: ${CAPTURE_DIRS}")
- message(STATUS "Extract Directories: ${EXTRACT_DIRS}")
-
- add_custom_target(generate-lib${target_name}-coverage
- COMMAND chmod +x ${GCOV_TOOL}
- COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --capture ${CAPTURE_DIRS} -o test_coverage.info
- COMMAND ${CODE_COVERAGE_LCOV} --gcov-tool ${GCOV_TOOL} --extract test_coverage.info ${EXTRACT_DIRS} -o test_coverage.info
- COMMAND ${CODE_COVERAGE_GENHTML} --demangle-cpp test_coverage.info -o test_coverage
- COMMAND ${CMAKE_COMMAND} -E remove test_coverage.info
- WORKING_DIRECTORY ${output_dir}
- COMMENT "Generating coverage results")
-endfunction()
diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake
index f0dffef..c5b2ffd 100644
--- a/libcxx/cmake/caches/Generic-no-exceptions.cmake
+++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake
@@ -1,2 +1,6 @@
set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake
index f33ed01..f68b265 100644
--- a/libcxx/cmake/caches/Generic-no-experimental.cmake
+++ b/libcxx/cmake/caches/Generic-no-experimental.cmake
@@ -1,2 +1,6 @@
set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "")
set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake
index 4000f3a..57b8d9f 100644
--- a/libcxx/cmake/caches/Generic-no-filesystem.cmake
+++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake
index 79d6b44..d3150ec 100644
--- a/libcxx/cmake/caches/Generic-no-localization.cmake
+++ b/libcxx/cmake/caches/Generic-no-localization.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake
index e9b4cc6..8cd1027 100644
--- a/libcxx/cmake/caches/Generic-no-random_device.cmake
+++ b/libcxx/cmake/caches/Generic-no-random_device.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-rtti.cmake b/libcxx/cmake/caches/Generic-no-rtti.cmake
index c62ddce..d080360 100644
--- a/libcxx/cmake/caches/Generic-no-rtti.cmake
+++ b/libcxx/cmake/caches/Generic-no-rtti.cmake
@@ -2,3 +2,7 @@ set(LIBCXX_ENABLE_RTTI OFF CACHE BOOL "")
set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_RTTI OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake
index 616baef..81c92fc 100644
--- a/libcxx/cmake/caches/Generic-no-threads.cmake
+++ b/libcxx/cmake/caches/Generic-no-threads.cmake
@@ -1,3 +1,7 @@
set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-tzdb.cmake b/libcxx/cmake/caches/Generic-no-tzdb.cmake
index 27c826ed..afe1c8a 100644
--- a/libcxx/cmake/caches/Generic-no-tzdb.cmake
+++ b/libcxx/cmake/caches/Generic-no-tzdb.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_TIME_ZONE_DATABASE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake
index 01160bf2..27fbc33 100644
--- a/libcxx/cmake/caches/Generic-no-unicode.cmake
+++ b/libcxx/cmake/caches/Generic-no-unicode.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
index 728d410..72c3045 100644
--- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake
+++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
@@ -1 +1,5 @@
set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "")
+
+# Speed up the CI
+set(LIBCXX_TEST_PARAMS "enable_modules=clang" CACHE STRING "")
+set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/config-ix.cmake b/libcxx/cmake/config-ix.cmake
index 9988196..270d805 100644
--- a/libcxx/cmake/config-ix.cmake
+++ b/libcxx/cmake/config-ix.cmake
@@ -12,7 +12,7 @@ include(CheckCSourceCompiles)
# LIBCXXABI_USE_LLVM_UNWINDER set, we'd be linking against the just-built
# libunwind (and the compiler implicit -lunwind wouldn't succeed as the newly
# built libunwind isn't installed yet). For those cases, it'd be good to
-# link with --uwnindlib=none. Check if that option works.
+# link with --unwindlib=none. Check if that option works.
llvm_check_compiler_linker_flag(C "--unwindlib=none" CXX_SUPPORTS_UNWINDLIB_EQ_NONE_FLAG)
if (NOT LIBCXX_USE_COMPILER_RT)
diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index f959c88..960fdd7 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -53,7 +53,9 @@ Deprecations and Removals
- TODO: The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable and the ``_LIBCPP_ENABLE_ASSERTIONS`` macro that were used to enable
the safe mode will be removed in LLVM 20.
-- TODO: The C++20 synchronization library will be removed entirely in language modes prior to C++20 in LLVM 20.
+- Support for the C++20 synchronization library (``<barrier>``, ``<latch>``, ``atomic::wait``, etc.) has been
+ removed in language modes prior to C++20. If you are using these features prior to C++20, you will need to
+ update to ``-std=c++20``.
- TODO: The relational operators for ``std::chrono::weekday`` will be removed entirely, and the
``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro that was used to re-enable this extension will be
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index 55a0f85..97ecf5e 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -78,7 +78,7 @@
"`2139 <https://wg21.link/LWG2139>`__","What is a user-defined type?","Rapperswil","",""
"`2970 <https://wg21.link/LWG2970>`__","Return type of std::visit misspecified","Rapperswil","|Complete|","11.0"
"`3058 <https://wg21.link/LWG3058>`__","Parallel adjacent_difference shouldn't require creating temporaries","Rapperswil","",""
-"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","",""
+"`3062 <https://wg21.link/LWG3062>`__","Unnecessary decay_t in is_execution_policy_v should be remove_cvref_t","Rapperswil","|Complete|","17.0"
"`3067 <https://wg21.link/LWG3067>`__","recursive_directory_iterator::pop must invalidate","Rapperswil","|Nothing To Do|",""
"`3071 <https://wg21.link/LWG3071>`__","[networking.ts] read_until still refers to ""input sequence""","Rapperswil","|Nothing To Do|",""
"`3074 <https://wg21.link/LWG3074>`__","Non-member functions for valarray should only deduce from the valarray","Rapperswil","",""
@@ -124,7 +124,7 @@
"`3137 <https://wg21.link/LWG3137>`__","Header for ``__cpp_lib_to_chars``\ ","San Diego","|Complete|",""
"`3140 <https://wg21.link/LWG3140>`__","``COMMON_REF``\ is unimplementable as specified","San Diego","|Nothing To Do|",""
"`3145 <https://wg21.link/LWG3145>`__","``file_clock``\ breaks ABI for C++17 implementations","San Diego","|Complete|",""
-"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","",""
+"`3147 <https://wg21.link/LWG3147>`__","Definitions of ""likely"" and ""unlikely"" are likely to cause problems","San Diego","|Nothing To Do|",""
"`3148 <https://wg21.link/LWG3148>`__","``<concepts>``\ should be freestanding","San Diego","",""
"`3153 <https://wg21.link/LWG3153>`__","``Common``\ and ``common_type``\ have too little in common","San Diego","|Complete|","13.0"
"`3154 <https://wg21.link/LWG3154>`__","``Common``\ and ``CommonReference``\ have a common defect","San Diego","|Nothing To Do|",""
@@ -155,7 +155,7 @@
"`3191 <https://wg21.link/LWG3191>`__","``std::ranges::shuffle``\ synopsis does not match algorithm definition","Cologne","|Complete|","15.0","|ranges|"
"`3196 <https://wg21.link/LWG3196>`__","``std::optional<T>``\ is ill-formed is ``T``\ is an array","Cologne","|Complete|",""
"`3198 <https://wg21.link/LWG3198>`__","Bad constraint on ``std::span::span()``\ ","Cologne","|Complete|",""
-"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\ fails","Cologne","",""
+"`3199 <https://wg21.link/LWG3199>`__","``istream >> bitset<0>``\ fails","Cologne","|Complete|","10.0"
"`3202 <https://wg21.link/LWG3202>`__","P0318R1 was supposed to be revised","Cologne","|Complete|",""
"`3206 <https://wg21.link/LWG3206>`__","``year_month_day``\ conversion to ``sys_days``\ uses not-existing member function","Cologne","|Complete|",""
"`3208 <https://wg21.link/LWG3208>`__","``Boolean``\ 's expression requirements are ordered inconsistently","Cologne","|Nothing To Do|",""
@@ -249,7 +249,7 @@
"`3325 <https://wg21.link/LWG3325>`__","Constrain return type of transformation function for ``transform_view``\ ","Prague","|Complete|","15.0","|ranges|"
"`3326 <https://wg21.link/LWG3326>`__","``enable_view``\ has false positives","Prague","|Complete|","15.0","|ranges|"
"`3327 <https://wg21.link/LWG3327>`__","Format alignment specifiers vs. text direction","Prague","|Nothing To Do|","","|format|"
-"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\ is not good for UTF-8","Prague","",""
+"`3328 <https://wg21.link/LWG3328>`__","Clarify that ``std::string``\ is not good for UTF-8","Prague","|Nothing To Do|",""
"`3329 <https://wg21.link/LWG3329>`__","``totally_ordered_with``\ both directly and indirectly requires ``common_reference_with``\ ","Prague","|Complete|","13.0"
"`3330 <https://wg21.link/LWG3330>`__","Include ``<compare>``\ from most library headers","Prague","|Complete|","13.0","|spaceship|"
"`3331 <https://wg21.link/LWG3331>`__","Define ``totally_ordered/_with``\ in terms of ``partially-ordered-with``\ ","Prague","|Complete|","13.0"
@@ -271,7 +271,7 @@
"`3358 <https://wg21.link/LWG3358>`__","|sect|\ [span.cons] is mistaken that ``to_address``\ can throw","Prague","|Complete|","17.0"
"`3359 <https://wg21.link/LWG3359>`__","``<chrono>``\ leap second support should allow for negative leap seconds","Prague","|In Progress|","","|chrono|"
"`3360 <https://wg21.link/LWG3360>`__","``three_way_comparable_with``\ is inconsistent with similar concepts","Prague","|Nothing To Do|","","|spaceship|"
-"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","",""
+"`3362 <https://wg21.link/LWG3362>`__","Strike ``stop_source``\ 's ``operator!=``\ ","Prague","|Complete|","17.0"
"`3363 <https://wg21.link/LWG3363>`__","``drop_while_view``\ should opt-out of ``sized_range``\ ","Prague","|Nothing To Do|","","|ranges|"
"`3364 <https://wg21.link/LWG3364>`__","Initialize data members of ranges and their iterators","Prague","|Complete|","16.0","|ranges|"
"`3367 <https://wg21.link/LWG3367>`__","Integer-class conversions should not throw","Prague","|Nothing To Do|",""
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index 4015d7a..858c789 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -171,7 +171,6 @@
"`P1739R4 <https://wg21.link/P1739R4>`__","LWG","Avoid template bloat for safe_ranges in combination with ""subrange-y"" view adaptors","Prague","|Complete|","15.0","|ranges|"
"`P1831R1 <https://wg21.link/P1831R1>`__","LWG","Deprecating volatile: library","Prague","* *",""
"`P1868R2 <https://wg21.link/P1868R2>`__","LWG","width: clarifying units of width and precision in std::format","Prague","|Complete|","14.0"
-"`P1937R2 <https://wg21.link/P1937R2>`__","CWG","Fixing inconsistencies between constexpr and consteval functions","Prague","* *",""
"`P1956R1 <https://wg21.link/P1956R1>`__","LWG","On the names of low-level bit manipulation functions","Prague","|Complete|","12.0"
"`P1957R2 <https://wg21.link/P1957R2>`__","CWG","Converting from ``T*``\ to bool should be considered narrowing (re: US 212)","Prague","|Complete|","18.0"
"`P1963R0 <https://wg21.link/P1963R0>`__","LWG","Fixing US 313","Prague","* *","",""
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index 0e466086..f624815 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -145,7 +145,7 @@
"`3607 <https://wg21.link/LWG3607>`__","``contiguous_iterator`` should not be allowed to have custom ``iter_move`` and ``iter_swap`` behavior","February 2022","|Nothing to do|","","|ranges|"
"`3610 <https://wg21.link/LWG3610>`__","``iota_view::size`` sometimes rejects integer-class types","February 2022","","","|ranges|"
"`3612 <https://wg21.link/LWG3612>`__","Inconsistent pointer alignment in ``std::format`` ","February 2022","|Complete|","14.0","|format|"
-"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","",""
+"`3616 <https://wg21.link/LWG3616>`__","LWG 3498 seems to miss the non-member ``swap`` for ``basic_syncbuf`` ","February 2022","|Complete|","18.0"
"`3618 <https://wg21.link/LWG3618>`__","Unnecessary ``iter_move`` for ``transform_view::iterator`` ","February 2022","|Complete|","19.0","|ranges|"
"`3619 <https://wg21.link/LWG3619>`__","Specification of ``vformat_to`` contains ill-formed ``formatted_size`` calls","February 2022","|Nothing to do|","","|format|"
"`3621 <https://wg21.link/LWG3621>`__","Remove feature-test macro ``__cpp_lib_monadic_optional`` ","February 2022","|Complete|","15.0"
@@ -164,9 +164,9 @@
"`3656 <https://wg21.link/LWG3656>`__","Inconsistent bit operations returning a count","July 2022","|Complete|","15.0",""
"`3659 <https://wg21.link/LWG3659>`__","Consider ``ATOMIC_FLAG_INIT`` undeprecation","July 2022","|Complete|","15.0"
"`3670 <https://wg21.link/LWG3670>`__","``Cpp17InputIterators`` don't have integer-class difference types","July 2022","","","|ranges|"
-"`3671 <https://wg21.link/LWG3671>`__","``atomic_fetch_xor`` missing from ``stdatomic.h``","July 2022","",""
+"`3671 <https://wg21.link/LWG3671>`__","``atomic_fetch_xor`` missing from ``stdatomic.h``","July 2022","|Complete|","20.0"
"`3672 <https://wg21.link/LWG3672>`__","``common_iterator::operator->()`` should return by value","July 2022","|Complete|","19.0","|ranges|"
-"`3683 <https://wg21.link/LWG3683>`__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","July 2022","",""
+"`3683 <https://wg21.link/LWG3683>`__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","July 2022","|Complete|","20.0"
"`3687 <https://wg21.link/LWG3687>`__","``expected<cv void, E>`` move constructor should move","July 2022","|Complete|","16.0"
"`3692 <https://wg21.link/LWG3692>`__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","July 2022","","","|ranges| |spaceship|"
"`3701 <https://wg21.link/LWG3701>`__","Make ``formatter<remove_cvref_t<const charT[N]>, charT>`` requirement explicit","July 2022","|Complete|","15.0","|format|"
@@ -180,7 +180,7 @@
"`3710 <https://wg21.link/LWG3710>`__","The ``end`` of ``chunk_view`` for input ranges can be ``const``","July 2022","","","|ranges|"
"`3711 <https://wg21.link/LWG3711>`__","Missing preconditions for slide_view constructor","July 2022","","","|ranges|"
"`3712 <https://wg21.link/LWG3712>`__","``chunk_view`` and ``slide_view`` should not be ``default_initializable``","July 2022","","","|ranges|"
-"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","",""
+"`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","|Nothing To Do|",""
"`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","|Complete|","19.0","|ranges|"
"`3719 <https://wg21.link/LWG3719>`__","Directory iterators should be usable with default sentinel","July 2022","|Complete|","17.0","|ranges|"
"`3721 <https://wg21.link/LWG3721>`__","Allow an ``arg-id`` with a value of zero for ``width`` in ``std-format-spec``","July 2022","|Complete|","16.0","|format|"
@@ -228,7 +228,7 @@
"`3778 <https://wg21.link/LWG3778>`__","``vector<bool>`` missing exception specifications", "November 2022","|Complete|","3.7",""
"`3781 <https://wg21.link/LWG3781>`__","The exposition-only alias templates ``cont-key-type`` and ``cont-mapped-type`` should be removed", "November 2022","|Nothing to do|","",""
"`3782 <https://wg21.link/LWG3782>`__","Should ``<math.h>`` declare ``::lerp``?", "November 2022","|Complete|","17.0",""
-"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends", "November 2022","","",""
+"`3784 <https://wg21.link/LWG3784>`__","std.compat should not provide ``::byte`` and its friends", "November 2022","|Complete|","19.0",""
"`3785 <https://wg21.link/LWG3785>`__","``ranges::to`` is over-constrained on the destination type being a range", "November 2022","","","|ranges|"
"`3788 <https://wg21.link/LWG3788>`__","``jthread::operator=(jthread&&)`` postconditions are unimplementable under self-assignment", "November 2022","","",""
"`3792 <https://wg21.link/LWG3792>`__","``__cpp_lib_constexpr_algorithms`` should also be defined in ``<utility>``", "November 2022","|Complete|","16.0",""
@@ -241,9 +241,9 @@
"`3817 <https://wg21.link/LWG3817>`__","Missing preconditions on ``forward_list`` modifiers", "November 2022","","",""
"`3818 <https://wg21.link/LWG3818>`__","Exposition-only concepts are not described in library intro", "November 2022","|Nothing to do|","",""
"`3822 <https://wg21.link/LWG3822>`__","Avoiding normalization in ``filesystem::weakly_canonical``", "November 2022","","",""
-"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``", "November 2022","","",""
+"`3823 <https://wg21.link/LWG3823>`__","Unnecessary precondition for ``is_aggregate``", "November 2022","|Nothing To Do|","",""
"`3824 <https://wg21.link/LWG3824>`__","Number of ``bind`` placeholders is underspecified", "November 2022","|Nothing to do|","",""
-"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]", "November 2022","","",""
+"`3826 <https://wg21.link/LWG3826>`__","Redundant specification [for overload of yield_value]", "November 2022","|Nothing To Do|","",""
"","","","","",""
"`2195 <https://wg21.link/LWG2195>`__","Missing constructors for ``match_results``","February 2023","","",""
"`2295 <https://wg21.link/LWG2295>`__","Locale name when the provided ``Facet`` is a ``nullptr``","February 2023","","",""
@@ -288,7 +288,7 @@
"`3803 <https://wg21.link/LWG3803>`__","``flat_foo`` constructors taking ``KeyContainer`` lack ``KeyCompare`` parameter","February 2023","","",""
"`3810 <https://wg21.link/LWG3810>`__","CTAD for ``std::basic_format_args``","February 2023","|Complete|","17.0","|format|"
"`3827 <https://wg21.link/LWG3827>`__","Deprecate ``<stdalign.h>`` and ``<stdbool.h>`` macros","February 2023","","",""
-"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","","",""
+"`3828 <https://wg21.link/LWG3828>`__","Sync ``intmax_t`` and ``uintmax_t`` with C2x","February 2023","|Nothing To Do|","",""
"`3833 <https://wg21.link/LWG3833>`__","Remove specialization ``template<size_t N> struct formatter<const charT[N], charT>``","February 2023","|Complete|","17.0","|format|"
"`3836 <https://wg21.link/LWG3836>`__","``std::expected<bool, E1>`` conversion constructor ``expected(const expected<U, G>&)`` should take precedence over ``expected(U&&)`` with operator ``bool``","February 2023","|Complete|","18.0",""
"`3843 <https://wg21.link/LWG3843>`__","``std::expected<T,E>::value() &`` assumes ``E`` is copy constructible","February 2023","|Complete|","17.0",""
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index dec9af1..29eb163 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -18,7 +18,7 @@
"`3940 <https://wg21.link/LWG3940>`__","``std::expected<void, E>::value()`` also needs ``E`` to be copy constructible","Varna June 2023","|Complete|","18.0",""
"","","","","",""
"`2392 <https://wg21.link/LWG2392>`__","""character type"" is used but not defined","Kona November 2023","","",""
-"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","","",""
+"`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","|Nothing To Do|","",""
"`3305 <https://wg21.link/LWG3305>`__","``any_cast<void>``","Kona November 2023","|Complete|","18.0",""
"`3431 <https://wg21.link/LWG3431>`__","``<=>`` for containers should require ``three_way_comparable<T>`` instead of ``<=>``","Kona November 2023","","",""
"`3749 <https://wg21.link/LWG3749>`__","``common_iterator`` should handle integer-class difference types","Kona November 2023","","",""
diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst
index 2457ed3..65a7e1e 100644
--- a/libcxx/docs/TestingLibcxx.rst
+++ b/libcxx/docs/TestingLibcxx.rst
@@ -455,9 +455,9 @@ An example build would look like:
$ ninja -C build cxx-benchmarks
-This will build all of the benchmarks under ``<libcxx-src>/benchmarks`` to be
+This will build all of the benchmarks under ``<libcxx>/test/benchmarks`` to be
built against the just-built libc++. The compiled tests are output into
-``build/projects/libcxx/benchmarks``.
+``build/libcxx/test/benchmarks``.
Also See:
@@ -474,9 +474,9 @@ For example:
.. code-block:: bash
- $ cd build/projects/libcxx/benchmarks
- $ ./algorithms.bench.out # Runs all the benchmarks
- $ ./algorithms.bench.out --benchmark_filter=BM_Sort.* # Only runs the sort benchmarks
+ $ cd build/libcxx/test/benchmarks
+ $ ./find.bench.out # Runs all the benchmarks
+ $ ./find.bench.out --benchmark_filter="bm_ranges_find<std::vector<char>>" # Only runs that specific benchmark
For more information about running benchmarks see `Google Benchmark`_.
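For readers new to the suite, here is a minimal sketch of what one of the ``*.bench.cpp`` files referenced above typically looks like; it assumes the Google Benchmark API the suite links against, and ``bm_vector_push_back`` is an illustrative name, not an existing benchmark in the tree.

.. code-block:: cpp

   #include <benchmark/benchmark.h>
   #include <vector>

   // Hypothetical benchmark used only to illustrate the file layout: the
   // framework runs the loop body repeatedly and reports the time per iteration.
   static void bm_vector_push_back(benchmark::State& state) {
     for (auto _ : state) {
       std::vector<int> v;
       v.reserve(1024);
       for (int i = 0; i < 1024; ++i)
         v.push_back(i);
       benchmark::DoNotOptimize(v.data()); // keep the work from being optimized away
     }
   }
   BENCHMARK(bm_vector_push_back);
   BENCHMARK_MAIN();

Each such file is compiled into its own ``<name>.bench.out`` executable in the output directory mentioned above.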
diff --git a/libcxx/include/__algorithm/find_end.h b/libcxx/include/__algorithm/find_end.h
index 7e08e79..841e0fd 100644
--- a/libcxx/include/__algorithm/find_end.h
+++ b/libcxx/include/__algorithm/find_end.h
@@ -80,109 +80,6 @@ _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_Iter1, _Iter1>
}
}
-template < class _IterOps,
- class _Pred,
- class _Iter1,
- class _Sent1,
- class _Iter2,
- class _Sent2,
- class _Proj1,
- class _Proj2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter1 __find_end(
- _Iter1 __first1,
- _Sent1 __sent1,
- _Iter2 __first2,
- _Sent2 __sent2,
- _Pred& __pred,
- _Proj1& __proj1,
- _Proj2& __proj2,
- bidirectional_iterator_tag,
- bidirectional_iterator_tag) {
- auto __last1 = _IterOps::next(__first1, __sent1);
- auto __last2 = _IterOps::next(__first2, __sent2);
- // modeled after search algorithm (in reverse)
- if (__first2 == __last2)
- return __last1; // Everything matches an empty sequence
- _Iter1 __l1 = __last1;
- _Iter2 __l2 = __last2;
- --__l2;
- while (true) {
- // Find last element in sequence 1 that matches *(__last2-1), with a minimum of loop checks
- while (true) {
- if (__first1 == __l1) // return __last1 if no element matches *__first2
- return __last1;
- if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2)))
- break;
- }
- // *__l1 matches *__l2, now match elements before here
- _Iter1 __m1 = __l1;
- _Iter2 __m2 = __l2;
- while (true) {
- if (__m2 == __first2) // If pattern exhausted, __m1 is the answer (works for 1 element pattern)
- return __m1;
- if (__m1 == __first1) // Otherwise if source exhausted, pattern not found
- return __last1;
-
- // if there is a mismatch, restart with a new __l1
- if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(__proj2, *--__m2))) {
- break;
- } // else there is a match, check next elements
- }
- }
-}
-
-template < class _AlgPolicy,
- class _Pred,
- class _Iter1,
- class _Sent1,
- class _Iter2,
- class _Sent2,
- class _Proj1,
- class _Proj2>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter1 __find_end(
- _Iter1 __first1,
- _Sent1 __sent1,
- _Iter2 __first2,
- _Sent2 __sent2,
- _Pred& __pred,
- _Proj1& __proj1,
- _Proj2& __proj2,
- random_access_iterator_tag,
- random_access_iterator_tag) {
- typedef typename iterator_traits<_Iter1>::difference_type _D1;
- auto __last1 = _IterOps<_AlgPolicy>::next(__first1, __sent1);
- auto __last2 = _IterOps<_AlgPolicy>::next(__first2, __sent2);
- // Take advantage of knowing source and pattern lengths. Stop short when source is smaller than pattern
- auto __len2 = __last2 - __first2;
- if (__len2 == 0)
- return __last1;
- auto __len1 = __last1 - __first1;
- if (__len1 < __len2)
- return __last1;
- const _Iter1 __s = __first1 + _D1(__len2 - 1); // End of pattern match can't go before here
- _Iter1 __l1 = __last1;
- _Iter2 __l2 = __last2;
- --__l2;
- while (true) {
- while (true) {
- if (__s == __l1)
- return __last1;
- if (std::__invoke(__pred, std::__invoke(__proj1, *--__l1), std::__invoke(__proj2, *__l2)))
- break;
- }
- _Iter1 __m1 = __l1;
- _Iter2 __m2 = __l2;
- while (true) {
- if (__m2 == __first2)
- return __m1;
- // no need to check range on __m1 because __s guarantees we have enough source
- if (!std::__invoke(__pred, std::__invoke(__proj1, *--__m1), std::__invoke(*--__m2))) {
- break;
- }
- }
- }
-}
-
template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _ForwardIterator1 __find_end_classic(
_ForwardIterator1 __first1,
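For orientation, the user-facing behavior this header implements is that of ``std::find_end``, which returns an iterator to the start of the last occurrence of a pattern. A minimal usage sketch:

.. code-block:: cpp

   #include <algorithm>
   #include <cassert>
   #include <vector>

   int main() {
     std::vector<int> haystack{1, 2, 3, 1, 2, 3, 4};
     std::vector<int> needle{1, 2, 3};
     // find_end locates the *last* match of the pattern, here at index 3.
     auto it = std::find_end(haystack.begin(), haystack.end(),
                             needle.begin(), needle.end());
     assert(it == haystack.begin() + 3);
   }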
diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h
index bd3f659..bcea21f 100644
--- a/libcxx/include/__atomic/atomic.h
+++ b/libcxx/include/__atomic/atomic.h
@@ -429,6 +429,8 @@ _LIBCPP_HIDE_FROM_ABI bool atomic_compare_exchange_strong_explicit(
return __o->compare_exchange_strong(*__e, __d, __s, __f);
}
+#if _LIBCPP_STD_VER >= 20
+
// atomic_wait
template <class _Tp>
@@ -462,29 +464,27 @@ atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __
// atomic_notify_one
template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT {
__o->notify_one();
}
template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT {
__o->notify_one();
}
// atomic_notify_all
template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT {
__o->notify_all();
}
template <class _Tp>
-_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
-atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT {
__o->notify_all();
}
+#endif // _LIBCPP_STD_VER >= 20
+
// atomic_fetch_add
template <class _Tp>
diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h
index 7e26434..93f5c4c 100644
--- a/libcxx/include/__atomic/atomic_base.h
+++ b/libcxx/include/__atomic/atomic_base.h
@@ -101,6 +101,7 @@ struct __atomic_base // false
return std::__cxx_atomic_compare_exchange_strong(std::addressof(__a_), std::addressof(__e), __d, __m, __m);
}
+#if _LIBCPP_STD_VER >= 20
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const
volatile _NOEXCEPT {
std::__atomic_wait(*this, __v, __m);
@@ -117,6 +118,7 @@ struct __atomic_base // false
std::__atomic_notify_all(*this);
}
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); }
+#endif // _LIBCPP_STD_VER >= 20
#if _LIBCPP_STD_VER >= 20
_LIBCPP_HIDE_FROM_ABI constexpr __atomic_base() noexcept(is_nothrow_default_constructible_v<_Tp>) : __a_(_Tp()) {}
diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h
index 00b157c..abebfc1 100644
--- a/libcxx/include/__atomic/atomic_flag.h
+++ b/libcxx/include/__atomic/atomic_flag.h
@@ -48,26 +48,24 @@ struct atomic_flag {
__cxx_atomic_store(&__a_, _LIBCPP_ATOMIC_FLAG_TYPE(false), __m);
}
- _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
- wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile _NOEXCEPT {
+#if _LIBCPP_STD_VER >= 20
+ _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(bool __v, memory_order __m = memory_order_seq_cst) const
+ volatile _NOEXCEPT {
std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m);
}
- _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+ _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
wait(bool __v, memory_order __m = memory_order_seq_cst) const _NOEXCEPT {
std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m);
}
- _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
- std::__atomic_notify_one(*this);
- }
- _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT {
+ _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
std::__atomic_notify_one(*this);
}
+ _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT { std::__atomic_notify_one(*this); }
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() volatile _NOEXCEPT {
std::__atomic_notify_all(*this);
}
- _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT {
- std::__atomic_notify_all(*this);
- }
+ _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); }
+#endif
#if _LIBCPP_STD_VER >= 20
_LIBCPP_HIDE_FROM_ABI constexpr atomic_flag() _NOEXCEPT : __a_(false) {}
@@ -144,45 +142,45 @@ inline _LIBCPP_HIDE_FROM_ABI void atomic_flag_clear_explicit(atomic_flag* __o, m
__o->clear(__m);
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+#if _LIBCPP_STD_VER >= 20
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
atomic_flag_wait(const volatile atomic_flag* __o, bool __v) _NOEXCEPT {
__o->wait(__v);
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
atomic_flag_wait(const atomic_flag* __o, bool __v) _NOEXCEPT {
__o->wait(__v);
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT {
__o->wait(__v, __m);
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT {
__o->wait(__v, __m);
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
atomic_flag_notify_one(volatile atomic_flag* __o) _NOEXCEPT {
__o->notify_one();
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
-atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT {
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT {
__o->notify_one();
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
atomic_flag_notify_all(volatile atomic_flag* __o) _NOEXCEPT {
__o->notify_all();
}
-inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void
-atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT {
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT {
__o->notify_all();
}
+#endif // _LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h
index 156f196..2849b82 100644
--- a/libcxx/include/__atomic/atomic_ref.h
+++ b/libcxx/include/__atomic/atomic_ref.h
@@ -42,6 +42,19 @@ _LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER >= 20
+// These types are required to make __atomic_is_always_lock_free work across GCC and Clang.
+// The purpose of this trick is to make sure that we provide an object with the correct alignment
+// to __atomic_is_always_lock_free, since that answer depends on the alignment.
+template <size_t _Alignment>
+struct __alignment_checker_type {
+ alignas(_Alignment) char __data;
+};
+
+template <size_t _Alignment>
+struct __get_aligner_instance {
+ static constexpr __alignment_checker_type<_Alignment> __instance{};
+};
+
template <class _Tp>
struct __atomic_ref_base {
protected:
@@ -105,7 +118,7 @@ public:
// that the pointer is going to be aligned properly at runtime because that is a (checked) precondition
// of atomic_ref's constructor.
static constexpr bool is_always_lock_free =
- __atomic_always_lock_free(sizeof(_Tp), reinterpret_cast<void*>(-required_alignment));
+ __atomic_always_lock_free(sizeof(_Tp), &__get_aligner_instance<required_alignment>::__instance);
_LIBCPP_HIDE_FROM_ABI bool is_lock_free() const noexcept { return __atomic_is_lock_free(sizeof(_Tp), __ptr_); }
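As a rough stand-alone illustration of the comment above (assuming the GCC/Clang ``__atomic_always_lock_free`` builtin, whose compile-time answer depends on the alignment of the pointer argument), the same trick looks like this; ``aligned_probe`` and ``probe`` are illustrative names only:

.. code-block:: cpp

   #include <cstddef>

   // Hand the builtin the address of a real object with the alignment we care
   // about, instead of a fabricated pointer value, so both GCC and Clang can
   // evaluate the query as a constant expression.
   template <std::size_t Alignment>
   struct aligned_probe {
     alignas(Alignment) char data;
   };

   template <std::size_t Alignment>
   inline constexpr aligned_probe<Alignment> probe{};

   // Assumption: on mainstream targets a naturally aligned int is lock-free.
   static_assert(__atomic_always_lock_free(sizeof(int), &probe<alignof(int)>));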
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 0be25a5..392053a 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -699,14 +699,6 @@ typedef __char32_t char32_t;
# define _LIBCPP_DEPRECATED_(m)
# endif
-# if _LIBCPP_STD_VER < 20
-# define _LIBCPP_DEPRECATED_ATOMIC_SYNC \
- _LIBCPP_DEPRECATED_("The C++20 synchronization library has been deprecated prior to C++20. Please update to " \
- "using -std=c++20 if you need to use these facilities.")
-# else
-# define _LIBCPP_DEPRECATED_ATOMIC_SYNC /* nothing */
-# endif
-
# if !defined(_LIBCPP_CXX03_LANG)
# define _LIBCPP_DEPRECATED_IN_CXX11 _LIBCPP_DEPRECATED
# else
diff --git a/libcxx/include/__format/formatter.h b/libcxx/include/__format/formatter.h
index e2f418f..39c2670 100644
--- a/libcxx/include/__format/formatter.h
+++ b/libcxx/include/__format/formatter.h
@@ -40,6 +40,9 @@ struct _LIBCPP_TEMPLATE_VIS formatter {
# if _LIBCPP_STD_VER >= 23
template <class _Tp>
+constexpr bool enable_nonlocking_formatter_optimization = false;
+
+template <class _Tp>
_LIBCPP_HIDE_FROM_ABI constexpr void __set_debug_format(_Tp& __formatter) {
if constexpr (requires { __formatter.set_debug_format(); })
__formatter.set_debug_format();
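The variable template added above is the C++23 opt-in that, roughly, tells ``std::print`` it may format arguments of that type directly to the stream instead of building a temporary string first. A hedged sketch of how a user type might opt in; ``point`` and its formatter are hypothetical:

.. code-block:: cpp

   #include <format>

   struct point {
     int x, y;
   };

   template <>
   struct std::formatter<point, char> {
     constexpr auto parse(std::format_parse_context& ctx) { return ctx.begin(); }
     auto format(const point& p, std::format_context& ctx) const {
       return std::format_to(ctx.out(), "({}, {})", p.x, p.y);
     }
   };

   // The formatter above takes no locks, so it is safe to enable the
   // non-locking fast path for it (the default, declared above, is false).
   template <>
   inline constexpr bool std::enable_nonlocking_formatter_optimization<point> = true;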
diff --git a/libcxx/include/__format/formatter_bool.h b/libcxx/include/__format/formatter_bool.h
index 17dc695..63aa815 100644
--- a/libcxx/include/__format/formatter_bool.h
+++ b/libcxx/include/__format/formatter_bool.h
@@ -69,7 +69,11 @@ public:
__format_spec::__parser<_CharT> __parser_;
};
-#endif //_LIBCPP_STD_VER >= 20
+# if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<bool> = true;
+# endif //_LIBCPP_STD_VER >= 23
+#endif //_LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_char.h b/libcxx/include/__format/formatter_char.h
index d33e843..abfd65a 100644
--- a/libcxx/include/__format/formatter_char.h
+++ b/libcxx/include/__format/formatter_char.h
@@ -83,9 +83,17 @@ struct _LIBCPP_TEMPLATE_VIS formatter<char, wchar_t> : public __formatter_char<w
template <>
struct _LIBCPP_TEMPLATE_VIS formatter<wchar_t, wchar_t> : public __formatter_char<wchar_t> {};
-
# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
+# if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<char> = true;
+# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<wchar_t> = true;
+# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
+# endif //_LIBCPP_STD_VER >= 23
+
#endif //_LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index fa42ba2..334755f 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -774,6 +774,14 @@ struct _LIBCPP_TEMPLATE_VIS formatter<double, _CharT> : public __formatter_float
template <__fmt_char_type _CharT>
struct _LIBCPP_TEMPLATE_VIS formatter<long double, _CharT> : public __formatter_floating_point<_CharT> {};
+# if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<float> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<double> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<long double> = true;
+# endif //_LIBCPP_STD_VER >= 23
#endif //_LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_integer.h b/libcxx/include/__format/formatter_integer.h
index 41400f0..2c2e799 100644
--- a/libcxx/include/__format/formatter_integer.h
+++ b/libcxx/include/__format/formatter_integer.h
@@ -88,7 +88,38 @@ template <__fmt_char_type _CharT>
struct _LIBCPP_TEMPLATE_VIS formatter<__uint128_t, _CharT> : public __formatter_integer<_CharT> {};
# endif
-#endif //_LIBCPP_STD_VER >= 20
+# if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<signed char> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<short> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<int> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<long> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<long long> = true;
+# ifndef _LIBCPP_HAS_NO_INT128
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__int128_t> = true;
+# endif
+
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned char> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned short> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned long> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<unsigned long long> = true;
+# ifndef _LIBCPP_HAS_NO_INT128
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__uint128_t> = true;
+# endif
+# endif //_LIBCPP_STD_VER >= 23
+#endif //_LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_pointer.h b/libcxx/include/__format/formatter_pointer.h
index 6941343..e1c062c 100644
--- a/libcxx/include/__format/formatter_pointer.h
+++ b/libcxx/include/__format/formatter_pointer.h
@@ -65,6 +65,14 @@ struct _LIBCPP_TEMPLATE_VIS formatter<void*, _CharT> : public __formatter_pointe
template <__fmt_char_type _CharT>
struct _LIBCPP_TEMPLATE_VIS formatter<const void*, _CharT> : public __formatter_pointer<_CharT> {};
+# if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<nullptr_t> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<void*> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<const void*> = true;
+# endif //_LIBCPP_STD_VER >= 23
#endif //_LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/formatter_string.h b/libcxx/include/__format/formatter_string.h
index 347439fc..dee2b3a 100644
--- a/libcxx/include/__format/formatter_string.h
+++ b/libcxx/include/__format/formatter_string.h
@@ -143,7 +143,32 @@ struct _LIBCPP_TEMPLATE_VIS formatter<basic_string_view<_CharT, _Traits>, _CharT
}
};
-#endif //_LIBCPP_STD_VER >= 20
+# if _LIBCPP_STD_VER >= 23
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<char*> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<const char*> = true;
+template <size_t _Size>
+inline constexpr bool enable_nonlocking_formatter_optimization<char[_Size]> = true;
+template <class _Traits, class _Allocator>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string<char, _Traits, _Allocator>> = true;
+template <class _Traits>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string_view<char, _Traits>> = true;
+
+# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<wchar_t*> = true;
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<const wchar_t*> = true;
+template <size_t _Size>
+inline constexpr bool enable_nonlocking_formatter_optimization<wchar_t[_Size]> = true;
+template <class _Traits, class _Allocator>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string<wchar_t, _Traits, _Allocator>> = true;
+template <class _Traits>
+inline constexpr bool enable_nonlocking_formatter_optimization<basic_string_view<wchar_t, _Traits>> = true;
+# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
+# endif //_LIBCPP_STD_VER >= 23
+#endif //_LIBCPP_STD_VER >= 20
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__iterator/bounded_iter.h b/libcxx/include/__iterator/bounded_iter.h
index 8a81c9f..5a86bd9 100644
--- a/libcxx/include/__iterator/bounded_iter.h
+++ b/libcxx/include/__iterator/bounded_iter.h
@@ -209,9 +209,7 @@ public:
operator!=(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
return __x.__current_ != __y.__current_;
}
-#endif
- // TODO(mordante) disable these overloads in the LLVM 20 release.
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR friend bool
operator<(__bounded_iter const& __x, __bounded_iter const& __y) _NOEXCEPT {
return __x.__current_ < __y.__current_;
@@ -229,7 +227,7 @@ public:
return __x.__current_ >= __y.__current_;
}
-#if _LIBCPP_STD_VER >= 20
+#else
_LIBCPP_HIDE_FROM_ABI constexpr friend strong_ordering
operator<=>(__bounded_iter const& __x, __bounded_iter const& __y) noexcept {
if constexpr (three_way_comparable<_Iterator, strong_ordering>) {
diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h
index 56183c0..34f8d5f 100644
--- a/libcxx/include/__iterator/wrap_iter.h
+++ b/libcxx/include/__iterator/wrap_iter.h
@@ -145,9 +145,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
operator!=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEXCEPT {
return !(__x == __y);
}
-#endif
-
-// TODO(mordante) disable these overloads in the LLVM 20 release.
template <class _Iter1>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool
operator>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter1>& __y) _NOEXCEPT {
@@ -184,7 +181,7 @@ operator<=(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) _NOEX
return !(__y < __x);
}
-#if _LIBCPP_STD_VER >= 20
+#else
template <class _Iter1, class _Iter2>
_LIBCPP_HIDE_FROM_ABI constexpr strong_ordering
operator<=>(const __wrap_iter<_Iter1>& __x, const __wrap_iter<_Iter2>& __y) noexcept {
diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h
index a71096d..3444e95 100644
--- a/libcxx/include/__memory_resource/polymorphic_allocator.h
+++ b/libcxx/include/__memory_resource/polymorphic_allocator.h
@@ -174,6 +174,17 @@ public:
_LIBCPP_HIDE_FROM_ABI memory_resource* resource() const noexcept { return __res_; }
+ friend bool operator==(const polymorphic_allocator& __lhs, const polymorphic_allocator& __rhs) noexcept {
+ return *__lhs.resource() == *__rhs.resource();
+ }
+
+# if _LIBCPP_STD_VER <= 17
+ // This overload is not specified, it was added due to LWG3683.
+ friend bool operator!=(const polymorphic_allocator& __lhs, const polymorphic_allocator& __rhs) noexcept {
+ return *__lhs.resource() != *__rhs.resource();
+ }
+# endif
+
private:
template <class... _Args, size_t... _Is>
_LIBCPP_HIDE_FROM_ABI tuple<_Args&&...>
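The hidden friends added above implement LWG3683: with only the old namespace-scope ``operator==`` template, comparing an allocator against something that merely converts to ``polymorphic_allocator`` (such as a ``memory_resource*``) failed template argument deduction. A small sketch of the case that now works:

.. code-block:: cpp

   #include <memory_resource>

   void example() {
     std::pmr::monotonic_buffer_resource res;
     std::pmr::polymorphic_allocator<int> alloc(&res);

     // The right-hand side is a memory_resource*, so the old operator==
     // template could not deduce its argument; the hidden friend accepts it
     // through the implicit conversion to polymorphic_allocator.
     bool uses_res = (alloc == &res);
     (void)uses_res;
   }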
diff --git a/libcxx/include/__mutex/unique_lock.h b/libcxx/include/__mutex/unique_lock.h
index 4a616ba..5df791d 100644
--- a/libcxx/include/__mutex/unique_lock.h
+++ b/libcxx/include/__mutex/unique_lock.h
@@ -22,8 +22,6 @@
# pragma GCC system_header
#endif
-#ifndef _LIBCPP_HAS_NO_THREADS
-
_LIBCPP_BEGIN_NAMESPACE_STD
template <class _Mutex>
@@ -172,6 +170,4 @@ inline _LIBCPP_HIDE_FROM_ABI void swap(unique_lock<_Mutex>& __x, unique_lock<_Mu
_LIBCPP_END_NAMESPACE_STD
-#endif // _LIBCPP_HAS_NO_THREADS
-
#endif // _LIBCPP___MUTEX_UNIQUE_LOCK_H
diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h
index d2254a6..458c1cd 100644
--- a/libcxx/include/__thread/thread.h
+++ b/libcxx/include/__thread/thread.h
@@ -10,6 +10,7 @@
#ifndef _LIBCPP___THREAD_THREAD_H
#define _LIBCPP___THREAD_THREAD_H
+#include <__assert>
#include <__condition_variable/condition_variable.h>
#include <__config>
#include <__exception/terminate.h>
diff --git a/libcxx/include/array b/libcxx/include/array
index 6ffde852..4db0cb7 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -19,17 +19,17 @@ template <class T, size_t N >
struct array
{
// types:
- typedef T & reference;
- typedef const T & const_reference;
- typedef implementation defined iterator;
- typedef implementation defined const_iterator;
- typedef size_t size_type;
- typedef ptrdiff_t difference_type;
- typedef T value_type;
- typedef T* pointer;
- typedef const T* const_pointer;
- typedef std::reverse_iterator<iterator> reverse_iterator;
- typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ using value_type = T;
+ using pointer = T*;
+ using const_pointer = const T*;
+ using reference = T&;
+ using const_reference = const T&;
+ using size_type = size_t;
+ using difference_type = ptrdiff_t;
+ using iterator = implementation-defined;
+ using const_iterator = implementation-defined;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
// No explicit construct/copy/destroy for aggregate type
void fill(const T& u); // constexpr in C++20
@@ -270,20 +270,25 @@ struct _LIBCPP_TEMPLATE_VIS array {
template <class _Tp>
struct _LIBCPP_TEMPLATE_VIS array<_Tp, 0> {
// types:
- typedef array __self;
- typedef _Tp value_type;
- typedef value_type& reference;
- typedef const value_type& const_reference;
- typedef value_type* iterator;
- typedef const value_type* const_iterator;
- typedef value_type* pointer;
- typedef const value_type* const_pointer;
- typedef size_t size_type;
- typedef ptrdiff_t difference_type;
- typedef std::reverse_iterator<iterator> reverse_iterator;
- typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
-
- typedef __conditional_t<is_const<_Tp>::value, const __empty, __empty> _EmptyType;
+ using __self = array;
+ using value_type = _Tp;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using pointer = value_type*;
+ using const_pointer = const value_type*;
+#if defined(_LIBCPP_ABI_USE_WRAP_ITER_IN_STD_ARRAY)
+ using iterator = __wrap_iter<pointer>;
+ using const_iterator = __wrap_iter<const_pointer>;
+#else
+ using iterator = pointer;
+ using const_iterator = const_pointer;
+#endif
+ using size_type = size_t;
+ using difference_type = ptrdiff_t;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+ using _EmptyType = __conditional_t<is_const<_Tp>::value, const __empty, __empty>;
struct _ArrayInStructT {
_Tp __data_[1];
@@ -440,7 +445,7 @@ struct _LIBCPP_TEMPLATE_VIS tuple_size<array<_Tp, _Size> > : public integral_con
template <size_t _Ip, class _Tp, size_t _Size>
struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, array<_Tp, _Size> > {
static_assert(_Ip < _Size, "Index out of bounds in std::tuple_element<> (std::array)");
- typedef _Tp type;
+ using type = _Tp;
};
template <size_t _Ip, class _Tp, size_t _Size>
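Since the synopsis above spells ``iterator`` and ``const_iterator`` as implementation-defined (and the zero-sized specialization may use ``__wrap_iter`` under an ABI flag), portable code should not assume they are raw pointers. A small sketch:

.. code-block:: cpp

   #include <array>

   void example() {
     std::array<int, 3> a{1, 2, 3};
     // iterator may be a raw pointer or a wrapped iterator depending on the
     // ABI configuration, so name it through the member alias (or use auto).
     std::array<int, 3>::iterator it = a.begin();
     (void)it;
   }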
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index 0d13619..772ac99 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -101,12 +101,12 @@ struct atomic
bool compare_exchange_strong(T& expc, T desr,
memory_order m = memory_order_seq_cst) noexcept;
- void wait(T, memory_order = memory_order::seq_cst) const volatile noexcept;
- void wait(T, memory_order = memory_order::seq_cst) const noexcept;
- void notify_one() volatile noexcept;
- void notify_one() noexcept;
- void notify_all() volatile noexcept;
- void notify_all() noexcept;
+ void wait(T, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+ void wait(T, memory_order = memory_order::seq_cst) const noexcept; // since C++20
+ void notify_one() volatile noexcept; // since C++20
+ void notify_one() noexcept; // since C++20
+ void notify_all() volatile noexcept; // since C++20
+ void notify_all() noexcept; // since C++20
};
template <>
@@ -184,12 +184,12 @@ struct atomic<integral>
integral operator^=(integral op) volatile noexcept;
integral operator^=(integral op) noexcept;
- void wait(integral, memory_order = memory_order::seq_cst) const volatile noexcept;
- void wait(integral, memory_order = memory_order::seq_cst) const noexcept;
- void notify_one() volatile noexcept;
- void notify_one() noexcept;
- void notify_all() volatile noexcept;
- void notify_all() noexcept;
+ void wait(integral, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+ void wait(integral, memory_order = memory_order::seq_cst) const noexcept; // since C++20
+ void notify_one() volatile noexcept; // since C++20
+ void notify_one() noexcept; // since C++20
+ void notify_all() volatile noexcept; // since C++20
+ void notify_all() noexcept; // since C++20
};
template <class T>
@@ -254,12 +254,12 @@ struct atomic<T*>
T* operator-=(ptrdiff_t op) volatile noexcept;
T* operator-=(ptrdiff_t op) noexcept;
- void wait(T*, memory_order = memory_order::seq_cst) const volatile noexcept;
- void wait(T*, memory_order = memory_order::seq_cst) const noexcept;
- void notify_one() volatile noexcept;
- void notify_one() noexcept;
- void notify_all() volatile noexcept;
- void notify_all() noexcept;
+ void wait(T*, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+ void wait(T*, memory_order = memory_order::seq_cst) const noexcept; // since C++20
+ void notify_one() volatile noexcept; // since C++20
+ void notify_one() noexcept; // since C++20
+ void notify_all() volatile noexcept; // since C++20
+ void notify_all() noexcept; // since C++20
};
template<>
@@ -321,12 +321,12 @@ struct atomic<floating-point-type> { // since C++20
floating-point-type operator-=(floating-point-type) volatile noexcept;
floating-point-type operator-=(floating-point-type) noexcept;
- void wait(floating-point-type, memory_order = memory_order::seq_cst) const volatile noexcept;
- void wait(floating-point-type, memory_order = memory_order::seq_cst) const noexcept;
- void notify_one() volatile noexcept;
- void notify_one() noexcept;
- void notify_all() volatile noexcept;
- void notify_all() noexcept;
+ void wait(floating-point-type, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+ void wait(floating-point-type, memory_order = memory_order::seq_cst) const noexcept; // since C++20
+ void notify_one() volatile noexcept; // since C++20
+ void notify_one() noexcept; // since C++20
+ void notify_all() volatile noexcept; // since C++20
+ void notify_all() noexcept; // since C++20
};
// [atomics.nonmembers], non-member functions
@@ -443,23 +443,23 @@ template<class T>
memory_order) noexcept;
template<class T>
- void atomic_wait(const volatile atomic<T>*, atomic<T>::value_type) noexcept;
+ void atomic_wait(const volatile atomic<T>*, atomic<T>::value_type) noexcept; // since C++20
template<class T>
- void atomic_wait(const atomic<T>*, atomic<T>::value_type) noexcept;
+ void atomic_wait(const atomic<T>*, atomic<T>::value_type) noexcept; // since C++20
template<class T>
- void atomic_wait_explicit(const volatile atomic<T>*, atomic<T>::value_type,
+ void atomic_wait_explicit(const volatile atomic<T>*, atomic<T>::value_type, // since C++20
memory_order) noexcept;
template<class T>
- void atomic_wait_explicit(const atomic<T>*, atomic<T>::value_type,
+ void atomic_wait_explicit(const atomic<T>*, atomic<T>::value_type, // since C++20
memory_order) noexcept;
template<class T>
- void atomic_notify_one(volatile atomic<T>*) noexcept;
+ void atomic_notify_one(volatile atomic<T>*) noexcept; // since C++20
template<class T>
- void atomic_notify_one(atomic<T>*) noexcept;
+ void atomic_notify_one(atomic<T>*) noexcept; // since C++20
template<class T>
- void atomic_notify_all(volatile atomic<T>*) noexcept;
+ void atomic_notify_all(volatile atomic<T>*) noexcept; // since C++20
template<class T>
- void atomic_notify_all(atomic<T>*) noexcept;
+ void atomic_notify_all(atomic<T>*) noexcept; // since C++20
// Atomics for standard typedef types
@@ -534,12 +534,12 @@ typedef struct atomic_flag
void clear(memory_order m = memory_order_seq_cst) volatile noexcept;
void clear(memory_order m = memory_order_seq_cst) noexcept;
- void wait(bool, memory_order = memory_order::seq_cst) const volatile noexcept;
- void wait(bool, memory_order = memory_order::seq_cst) const noexcept;
- void notify_one() volatile noexcept;
- void notify_one() noexcept;
- void notify_all() volatile noexcept;
- void notify_all() noexcept;
+ void wait(bool, memory_order = memory_order::seq_cst) const volatile noexcept; // since C++20
+ void wait(bool, memory_order = memory_order::seq_cst) const noexcept; // since C++20
+ void notify_one() volatile noexcept; // since C++20
+ void notify_one() noexcept; // since C++20
+ void notify_all() volatile noexcept; // since C++20
+ void notify_all() noexcept; // since C++20
} atomic_flag;
bool atomic_flag_test(volatile atomic_flag* obj) noexcept;
@@ -557,14 +557,14 @@ void atomic_flag_clear(atomic_flag* obj) noexcept;
void atomic_flag_clear_explicit(volatile atomic_flag* obj, memory_order m) noexcept;
void atomic_flag_clear_explicit(atomic_flag* obj, memory_order m) noexcept;
-void atomic_wait(const volatile atomic_flag* obj, T old) noexcept;
-void atomic_wait(const atomic_flag* obj, T old) noexcept;
-void atomic_wait_explicit(const volatile atomic_flag* obj, T old, memory_order m) noexcept;
-void atomic_wait_explicit(const atomic_flag* obj, T old, memory_order m) noexcept;
-void atomic_flag_notify_one(volatile atomic_flag* obj) noexcept;
-void atomic_flag_notify_one(atomic_flag* obj) noexcept;
-void atomic_flag_notify_all(volatile atomic_flag* obj) noexcept;
-void atomic_flag_notify_all(atomic_flag* obj) noexcept;
+void atomic_wait(const volatile atomic_flag* obj, T old) noexcept; // since C++20
+void atomic_wait(const atomic_flag* obj, T old) noexcept; // since C++20
+void atomic_wait_explicit(const volatile atomic_flag* obj, T old, memory_order m) noexcept; // since C++20
+void atomic_wait_explicit(const atomic_flag* obj, T old, memory_order m) noexcept; // since C++20
+void atomic_flag_notify_one(volatile atomic_flag* obj) noexcept;                             // since C++20
+void atomic_flag_notify_one(atomic_flag* obj) noexcept;                                      // since C++20
+void atomic_flag_notify_all(volatile atomic_flag* obj) noexcept;                             // since C++20
+void atomic_flag_notify_all(atomic_flag* obj) noexcept;                                      // since C++20
// fences
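The synopsis annotations above mark the waiting and notifying operations as C++20 features. A minimal sketch of how they pair up in practice:

.. code-block:: cpp

   #include <atomic>
   #include <thread>

   std::atomic<int> ready{0};

   int main() {
     std::thread waiter([] {
       ready.wait(0); // blocks while the value is still the old value 0 (C++20)
     });
     ready.store(1, std::memory_order_release);
     ready.notify_one(); // wakes the waiter (C++20)
     waiter.join();
   }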
diff --git a/libcxx/include/barrier b/libcxx/include/barrier
index edee181..ba29ebc 100644
--- a/libcxx/include/barrier
+++ b/libcxx/include/barrier
@@ -17,7 +17,7 @@ namespace std
{
template<class CompletionFunction = see below>
- class barrier
+ class barrier // since C++20
{
public:
using arrival_token = see below;
@@ -68,7 +68,7 @@ namespace std
_LIBCPP_PUSH_MACROS
# include <__undef_macros>
-# if _LIBCPP_STD_VER >= 14
+# if _LIBCPP_STD_VER >= 20
_LIBCPP_BEGIN_NAMESPACE_STD
@@ -254,7 +254,7 @@ public:
# endif // !_LIBCPP_HAS_NO_TREE_BARRIER
template <class _CompletionF = __empty_completion>
-class _LIBCPP_DEPRECATED_ATOMIC_SYNC barrier {
+class barrier {
__barrier_base<_CompletionF> __b_;
public:
@@ -290,7 +290,7 @@ public:
_LIBCPP_END_NAMESPACE_STD
-# endif // _LIBCPP_STD_VER >= 14
+# endif // _LIBCPP_STD_VER >= 20
_LIBCPP_POP_MACROS
@@ -305,4 +305,4 @@ _LIBCPP_POP_MACROS
# include <variant>
#endif
-#endif //_LIBCPP_BARRIER
+#endif // _LIBCPP_BARRIER
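With the guards above, ``std::barrier`` is a plain C++20 component. A minimal usage sketch:

.. code-block:: cpp

   #include <barrier>
   #include <thread>

   int main() {
     std::barrier sync(2); // the phase completes once both participants arrive
     auto work = [&] { sync.arrive_and_wait(); };
     std::thread t1(work);
     std::thread t2(work);
     t1.join();
     t2.join();
   }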
diff --git a/libcxx/include/deque b/libcxx/include/deque
index e73135a..759de5d 100644
--- a/libcxx/include/deque
+++ b/libcxx/include/deque
@@ -380,9 +380,6 @@ public:
_LIBCPP_HIDE_FROM_ABI friend bool operator!=(const __deque_iterator& __x, const __deque_iterator& __y) {
return !(__x == __y);
}
-#endif
-
- // TODO(mordante) disable these overloads in the LLVM 20 release.
_LIBCPP_HIDE_FROM_ABI friend bool operator<(const __deque_iterator& __x, const __deque_iterator& __y) {
return __x.__m_iter_ < __y.__m_iter_ || (__x.__m_iter_ == __y.__m_iter_ && __x.__ptr_ < __y.__ptr_);
}
@@ -399,7 +396,8 @@ public:
return !(__x < __y);
}
-#if _LIBCPP_STD_VER >= 20
+#else
+
_LIBCPP_HIDE_FROM_ABI friend strong_ordering operator<=>(const __deque_iterator& __x, const __deque_iterator& __y) {
if (__x.__m_iter_ < __y.__m_iter_)
return strong_ordering::less;
diff --git a/libcxx/include/format b/libcxx/include/format
index a88b3ef..449e6f0 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -126,6 +126,9 @@ namespace std {
// [format.formatter], formatter
template<class T, class charT = char> struct formatter;
+ template<class T>
+ constexpr bool enable_nonlocking_formatter_optimization = false; // since C++23
+
// [format.parse.ctx], class template basic_format_parse_context
template<class charT> class basic_format_parse_context;
using format_parse_context = basic_format_parse_context<char>;
@@ -133,7 +136,7 @@ namespace std {
// [format.range], formatting of ranges
// [format.range.fmtkind], variable template format_kind
- enum class range_format { // since C++23
+ enum class range_format { // since C++23
disabled,
map,
set,
@@ -143,20 +146,20 @@ namespace std {
};
template<class R>
- constexpr unspecified format_kind = unspecified; // since C++23
+ constexpr unspecified format_kind = unspecified; // since C++23
template<ranges::input_range R>
requires same_as<R, remove_cvref_t<R>>
- constexpr range_format format_kind<R> = see below; // since C++23
+ constexpr range_format format_kind<R> = see below; // since C++23
// [format.range.formatter], class template range_formatter
template<class T, class charT = char>
requires same_as<remove_cvref_t<T>, T> && formattable<T, charT>
- class range_formatter; // since C++23
+ class range_formatter; // since C++23
// [format.range.fmtdef], class template range-default-formatter
template<range_format K, ranges::input_range R, class charT>
- struct range-default-formatter; // exposition only, since C++23
+ struct range-default-formatter; // exposition only, since C++23
// [format.range.fmtmap], [format.range.fmtset], [format.range.fmtstr],
// specializations for maps, sets, and strings
@@ -173,7 +176,7 @@ namespace std {
see below visit_format_arg(Visitor&& vis, basic_format_arg<Context> arg); // Deprecated in C++26
// [format.arg.store], class template format-arg-store
- template<class Context, class... Args> struct format-arg-store; // exposition only
+ template<class Context, class... Args> struct format-arg-store; // exposition only
template<class Context = format_context, class... Args>
format-arg-store<Context, Args...>
diff --git a/libcxx/include/latch b/libcxx/include/latch
index 81d6028a..b56e49bc 100644
--- a/libcxx/include/latch
+++ b/libcxx/include/latch
@@ -16,7 +16,7 @@
namespace std
{
- class latch
+ class latch // since C++20
{
public:
static constexpr ptrdiff_t max() noexcept;
@@ -59,11 +59,11 @@ namespace std
_LIBCPP_PUSH_MACROS
# include <__undef_macros>
-# if _LIBCPP_STD_VER >= 14
+# if _LIBCPP_STD_VER >= 20
_LIBCPP_BEGIN_NAMESPACE_STD
-class _LIBCPP_DEPRECATED_ATOMIC_SYNC latch {
+class latch {
__atomic_base<ptrdiff_t> __a_;
public:
@@ -116,7 +116,7 @@ private:
_LIBCPP_END_NAMESPACE_STD
-# endif // _LIBCPP_STD_VER >= 14
+# endif // _LIBCPP_STD_VER >= 20
_LIBCPP_POP_MACROS
@@ -126,4 +126,4 @@ _LIBCPP_POP_MACROS
# include <atomic>
#endif
-#endif //_LIBCPP_LATCH
+#endif // _LIBCPP_LATCH
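As with ``<barrier>``, ``std::latch`` is now guarded as a C++20 facility. A minimal sketch:

.. code-block:: cpp

   #include <latch>
   #include <thread>

   int main() {
     std::latch done(2); // single-use countdown
     std::thread a([&] { done.count_down(); });
     std::thread b([&] { done.count_down(); });
     done.wait(); // returns once the counter reaches zero
     a.join();
     b.join();
   }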
diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore
index 95a4375..bf6317c 100644
--- a/libcxx/include/semaphore
+++ b/libcxx/include/semaphore
@@ -16,7 +16,7 @@
namespace std {
template<ptrdiff_t least_max_value = implementation-defined>
-class counting_semaphore
+class counting_semaphore // since C++20
{
public:
static constexpr ptrdiff_t max() noexcept;
@@ -39,7 +39,7 @@ private:
ptrdiff_t counter; // exposition only
};
-using binary_semaphore = counting_semaphore<1>;
+using binary_semaphore = counting_semaphore<1>; // since C++20
}
@@ -68,7 +68,7 @@ using binary_semaphore = counting_semaphore<1>;
_LIBCPP_PUSH_MACROS
# include <__undef_macros>
-# if _LIBCPP_STD_VER >= 14
+# if _LIBCPP_STD_VER >= 20
_LIBCPP_BEGIN_NAMESPACE_STD
@@ -124,7 +124,7 @@ private:
};
template <ptrdiff_t __least_max_value = _LIBCPP_SEMAPHORE_MAX>
-class _LIBCPP_DEPRECATED_ATOMIC_SYNC counting_semaphore {
+class counting_semaphore {
__atomic_semaphore_base __semaphore_;
public:
@@ -169,13 +169,11 @@ public:
}
};
-_LIBCPP_SUPPRESS_DEPRECATED_PUSH
-using binary_semaphore _LIBCPP_DEPRECATED_ATOMIC_SYNC = counting_semaphore<1>;
-_LIBCPP_SUPPRESS_DEPRECATED_POP
+using binary_semaphore = counting_semaphore<1>;
_LIBCPP_END_NAMESPACE_STD
-# endif // _LIBCPP_STD_VER >= 14
+# endif // _LIBCPP_STD_VER >= 20
_LIBCPP_POP_MACROS
@@ -185,4 +183,4 @@ _LIBCPP_POP_MACROS
# include <atomic>
#endif
-#endif //_LIBCPP_SEMAPHORE
+#endif // _LIBCPP_SEMAPHORE
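The same C++20 guard now applies to ``<semaphore>``. A minimal sketch using ``binary_semaphore`` (an alias for ``counting_semaphore<1>`` per the synopsis above):

.. code-block:: cpp

   #include <semaphore>
   #include <thread>

   int main() {
     std::binary_semaphore signal(0); // starts unavailable
     std::thread producer([&] { signal.release(); });
     signal.acquire(); // blocks until the producer releases
     producer.join();
   }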
diff --git a/libcxx/include/stdatomic.h b/libcxx/include/stdatomic.h
index 79772eb..0ff1466 100644
--- a/libcxx/include/stdatomic.h
+++ b/libcxx/include/stdatomic.h
@@ -103,6 +103,8 @@ using std::atomic_fetch_sub // see below
using std::atomic_fetch_sub_explicit // see below
using std::atomic_fetch_or // see below
using std::atomic_fetch_or_explicit // see below
+using std::atomic_fetch_xor // see below
+using std::atomic_fetch_xor_explicit // see below
using std::atomic_fetch_and // see below
using std::atomic_fetch_and_explicit // see below
using std::atomic_flag_test_and_set // see below
@@ -204,6 +206,8 @@ using std::atomic_fetch_add_explicit _LIBCPP_USING_IF_EXISTS;
using std::atomic_fetch_and _LIBCPP_USING_IF_EXISTS;
using std::atomic_fetch_and_explicit _LIBCPP_USING_IF_EXISTS;
using std::atomic_fetch_or _LIBCPP_USING_IF_EXISTS;
+using std::atomic_fetch_xor_explicit _LIBCPP_USING_IF_EXISTS;
+using std::atomic_fetch_xor _LIBCPP_USING_IF_EXISTS;
using std::atomic_fetch_or_explicit _LIBCPP_USING_IF_EXISTS;
using std::atomic_fetch_sub _LIBCPP_USING_IF_EXISTS;
using std::atomic_fetch_sub_explicit _LIBCPP_USING_IF_EXISTS;
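The using-declarations added above make ``atomic_fetch_xor`` and ``atomic_fetch_xor_explicit`` reachable through ``<stdatomic.h>`` (LWG3671). A small C++23 sketch, assuming the header's ``_Atomic(T)`` macro:

.. code-block:: cpp

   #include <stdatomic.h>

   void toggle(_Atomic(unsigned)* flags, unsigned mask) {
     // Previously only add/sub/and/or were re-exported; xor is now available too.
     atomic_fetch_xor_explicit(flags, mask, memory_order_relaxed);
   }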
diff --git a/libcxx/modules/std/format.inc b/libcxx/modules/std/format.inc
index 743a438..09aa03a 100644
--- a/libcxx/modules/std/format.inc
+++ b/libcxx/modules/std/format.inc
@@ -46,6 +46,8 @@ export namespace std {
using std::formatter;
#if _LIBCPP_STD_VER >= 23
+ using std::enable_nonlocking_formatter_optimization;
+
// [format.formattable], concept formattable
using std::formattable;
#endif
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index bfc88c4..fe9d2666 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -147,11 +147,6 @@ if(NOT LIBCXX_INSTALL_LIBRARY)
set(exclude_from_all EXCLUDE_FROM_ALL)
endif()
-if (LIBCXX_GENERATE_COVERAGE AND NOT LIBCXX_COVERAGE_LIBRARY)
- find_compiler_rt_library(profile LIBCXX_COVERAGE_LIBRARY)
-endif()
-add_library_flags_if(LIBCXX_COVERAGE_LIBRARY "${LIBCXX_COVERAGE_LIBRARY}")
-
if (APPLE AND LLVM_USE_SANITIZER)
if (("${LLVM_USE_SANITIZER}" STREQUAL "Address") OR
("${LLVM_USE_SANITIZER}" STREQUAL "Address;Undefined") OR
diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt
index 001b29e..b25712b 100644
--- a/libcxx/test/CMakeLists.txt
+++ b/libcxx/test/CMakeLists.txt
@@ -1,10 +1,8 @@
include(HandleLitArguments)
add_subdirectory(tools)
-# By default, libcxx and libcxxabi share a library directory.
-if (NOT LIBCXX_CXX_ABI_LIBRARY_PATH)
- set(LIBCXX_CXX_ABI_LIBRARY_PATH "${LIBCXX_LIBRARY_DIR}" CACHE PATH
- "The path to libc++abi library.")
+if (LIBCXX_INCLUDE_BENCHMARKS)
+ add_subdirectory(benchmarks)
endif()
set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!")
@@ -49,15 +47,3 @@ add_lit_testsuite(check-cxx
"Running libcxx tests"
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS cxx-test-depends)
-
-if (LIBCXX_GENERATE_COVERAGE)
- include(CodeCoverage)
- set(output_dir "${CMAKE_CURRENT_BINARY_DIR}/coverage")
- set(capture_dirs
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_objects.dir/"
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx.dir/"
- "${LIBCXX_LIB_CMAKEFILES_DIR}/cxx_experimental.dir/"
- "${CMAKE_CURRENT_BINARY_DIR}")
- set(extract_dirs "${LIBCXX_SOURCE_DIR}/include;${LIBCXX_SOURCE_DIR}/src")
- setup_lcov_test_target_coverage("cxx" "${output_dir}" "${capture_dirs}" "${extract_dirs}")
-endif()
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/test/benchmarks/CMakeLists.txt
index d96ccc1..d61367a 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/test/benchmarks/CMakeLists.txt
@@ -5,8 +5,6 @@ include(CheckCXXCompilerFlag)
# Build Google Benchmark
#==============================================================================
-set(CMAKE_FOLDER "${CMAKE_FOLDER}/Benchmarks")
-
set(BENCHMARK_COMPILE_FLAGS
-Wno-unused-command-line-argument
-nostdinc++
@@ -192,6 +190,10 @@ if (LIBCXX_INCLUDE_TESTS)
include(AddLLVM)
configure_lit_site_cfg(
+ ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py.in
+ ${CMAKE_CURRENT_BINARY_DIR}/lit.cfg.py)
+
+ configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py)
diff --git a/libcxx/benchmarks/CartesianBenchmarks.h b/libcxx/test/benchmarks/CartesianBenchmarks.h
index eca4e15..eca4e15 100644
--- a/libcxx/benchmarks/CartesianBenchmarks.h
+++ b/libcxx/test/benchmarks/CartesianBenchmarks.h
diff --git a/libcxx/benchmarks/ContainerBenchmarks.h b/libcxx/test/benchmarks/ContainerBenchmarks.h
index 744505b..744505b 100644
--- a/libcxx/benchmarks/ContainerBenchmarks.h
+++ b/libcxx/test/benchmarks/ContainerBenchmarks.h
diff --git a/libcxx/benchmarks/GenerateInput.h b/libcxx/test/benchmarks/GenerateInput.h
index 5710b4e..cc16943 100644
--- a/libcxx/benchmarks/GenerateInput.h
+++ b/libcxx/test/benchmarks/GenerateInput.h
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#ifndef BENCHMARK_GENERATE_INPUT_H
#define BENCHMARK_GENERATE_INPUT_H
diff --git a/libcxx/benchmarks/Utilities.h b/libcxx/test/benchmarks/Utilities.h
index fed16ba..fed16ba 100644
--- a/libcxx/benchmarks/Utilities.h
+++ b/libcxx/test/benchmarks/Utilities.h
diff --git a/libcxx/benchmarks/VariantBenchmarks.h b/libcxx/test/benchmarks/VariantBenchmarks.h
index a8e9c9f..a8e9c9f 100644
--- a/libcxx/benchmarks/VariantBenchmarks.h
+++ b/libcxx/test/benchmarks/VariantBenchmarks.h
diff --git a/libcxx/benchmarks/algorithms.partition_point.bench.cpp b/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
index 7711517..ed2e337 100644
--- a/libcxx/benchmarks/algorithms.partition_point.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms.partition_point.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include <algorithm>
#include <array>
#include <cassert>
diff --git a/libcxx/benchmarks/algorithms/common.h b/libcxx/test/benchmarks/algorithms/common.h
index 43131a4..43131a4 100644
--- a/libcxx/benchmarks/algorithms/common.h
+++ b/libcxx/test/benchmarks/algorithms/common.h
diff --git a/libcxx/benchmarks/algorithms/count.bench.cpp b/libcxx/test/benchmarks/algorithms/count.bench.cpp
index 7370293..7370293 100644
--- a/libcxx/benchmarks/algorithms/count.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/count.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/equal.bench.cpp b/libcxx/test/benchmarks/algorithms/equal.bench.cpp
index 6d63d8c..6d63d8c 100644
--- a/libcxx/benchmarks/algorithms/equal.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/equal.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/fill.bench.cpp b/libcxx/test/benchmarks/algorithms/fill.bench.cpp
index 40f3742..40f3742 100644
--- a/libcxx/benchmarks/algorithms/fill.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/fill.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/find.bench.cpp b/libcxx/test/benchmarks/algorithms/find.bench.cpp
index 6ff2d95..6ff2d95 100644
--- a/libcxx/benchmarks/algorithms/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/find.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/for_each.bench.cpp
index 7019dc1..7019dc1 100644
--- a/libcxx/benchmarks/algorithms/for_each.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/for_each.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/lower_bound.bench.cpp b/libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp
index 3be5010..3be5010 100644
--- a/libcxx/benchmarks/algorithms/lower_bound.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/lower_bound.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/make_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/make_heap.bench.cpp
index dade7b8..dade7b8 100644
--- a/libcxx/benchmarks/algorithms/make_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
index 48f34f8..48f34f8 100644
--- a/libcxx/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/make_heap_then_sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/min.bench.cpp b/libcxx/test/benchmarks/algorithms/min.bench.cpp
index 1e1dd4e..a09bd53 100644
--- a/libcxx/benchmarks/algorithms/min.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/min.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include <algorithm>
#include <cassert>
diff --git a/libcxx/benchmarks/algorithms/min_max_element.bench.cpp b/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
index e2c6423..e2c6423 100644
--- a/libcxx/benchmarks/algorithms/min_max_element.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/min_max_element.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/minmax.bench.cpp b/libcxx/test/benchmarks/algorithms/minmax.bench.cpp
index b0ff7f91..ca1cdb4 100644
--- a/libcxx/benchmarks/algorithms/minmax.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/minmax.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include <algorithm>
#include <cassert>
diff --git a/libcxx/benchmarks/algorithms/mismatch.bench.cpp b/libcxx/test/benchmarks/algorithms/mismatch.bench.cpp
index 7917828..7917828 100644
--- a/libcxx/benchmarks/algorithms/mismatch.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/mismatch.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/pop_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp
index 26cdd25..26cdd25 100644
--- a/libcxx/benchmarks/algorithms/pop_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp
index 72541f7..72541f7 100644
--- a/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/pstl.stable_sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/push_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/push_heap.bench.cpp
index ba96fa1..ba96fa1 100644
--- a/libcxx/benchmarks/algorithms/push_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/push_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_contains.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
index f36ebff..f36ebff 100644
--- a/libcxx/benchmarks/algorithms/ranges_contains.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_contains.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_ends_with.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp
index 049af7c2..049af7c2 100644
--- a/libcxx/benchmarks/algorithms/ranges_ends_with.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_ends_with.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_make_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp
index 66a8335..66a8335 100644
--- a/libcxx/benchmarks/algorithms/ranges_make_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
index 01632c8..01632c8 100644
--- a/libcxx/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_make_heap_then_sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_pop_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp
index bcc7a83..bcc7a83 100644
--- a/libcxx/benchmarks/algorithms/ranges_pop_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_push_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp
index 902f481..902f481 100644
--- a/libcxx/benchmarks/algorithms/ranges_push_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_push_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp
index aeb2aed..aeb2aed 100644
--- a/libcxx/benchmarks/algorithms/ranges_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp
index 62c607c..62c607c 100644
--- a/libcxx/benchmarks/algorithms/ranges_sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/ranges_stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp
index 8832748..8832748 100644
--- a/libcxx/benchmarks/algorithms/ranges_stable_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/ranges_stable_sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp
index b3fb15f..b3fb15f 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/set_intersection.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/sort.bench.cpp b/libcxx/test/benchmarks/algorithms/sort.bench.cpp
index f87434b..f87434b 100644
--- a/libcxx/benchmarks/algorithms/sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/sort_heap.bench.cpp b/libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp
index 1372b4d..1372b4d 100644
--- a/libcxx/benchmarks/algorithms/sort_heap.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/sort_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp b/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp
index 024a036..024a036 100644
--- a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/stable_sort.bench.cpp
diff --git a/libcxx/benchmarks/allocation.bench.cpp b/libcxx/test/benchmarks/allocation.bench.cpp
index 1d0c71f..1d0c71f 100644
--- a/libcxx/benchmarks/allocation.bench.cpp
+++ b/libcxx/test/benchmarks/allocation.bench.cpp
diff --git a/libcxx/benchmarks/atomic_wait.bench.cpp b/libcxx/test/benchmarks/atomic_wait.bench.cpp
index 4a06a45..dd541b4 100644
--- a/libcxx/benchmarks/atomic_wait.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
index c60fcd5..1a52e5d 100644
--- a/libcxx/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
+++ b/libcxx/test/benchmarks/atomic_wait_vs_mutex_lock.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/deque.bench.cpp b/libcxx/test/benchmarks/deque.bench.cpp
index d6dadaa..d6dadaa 100644
--- a/libcxx/benchmarks/deque.bench.cpp
+++ b/libcxx/test/benchmarks/deque.bench.cpp
diff --git a/libcxx/benchmarks/deque_iterator.bench.cpp b/libcxx/test/benchmarks/deque_iterator.bench.cpp
index 0eb23f2..0eb23f2 100644
--- a/libcxx/benchmarks/deque_iterator.bench.cpp
+++ b/libcxx/test/benchmarks/deque_iterator.bench.cpp
diff --git a/libcxx/benchmarks/exception_ptr.bench.cpp b/libcxx/test/benchmarks/exception_ptr.bench.cpp
index 1292ad7..1292ad7 100644
--- a/libcxx/benchmarks/exception_ptr.bench.cpp
+++ b/libcxx/test/benchmarks/exception_ptr.bench.cpp
diff --git a/libcxx/benchmarks/filesystem.bench.cpp b/libcxx/test/benchmarks/filesystem.bench.cpp
index d1a1763..19f9586 100644
--- a/libcxx/benchmarks/filesystem.bench.cpp
+++ b/libcxx/test/benchmarks/filesystem.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include <filesystem>
#include "GenerateInput.h"
diff --git a/libcxx/benchmarks/format.bench.cpp b/libcxx/test/benchmarks/format.bench.cpp
index 89f1132..d6cb046 100644
--- a/libcxx/benchmarks/format.bench.cpp
+++ b/libcxx/test/benchmarks/format.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/format_to.bench.cpp b/libcxx/test/benchmarks/format_to.bench.cpp
index 4e6041d..48cb3da 100644
--- a/libcxx/benchmarks/format_to.bench.cpp
+++ b/libcxx/test/benchmarks/format_to.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/format_to_n.bench.cpp b/libcxx/test/benchmarks/format_to_n.bench.cpp
index f5816d4..99723c7 100644
--- a/libcxx/benchmarks/format_to_n.bench.cpp
+++ b/libcxx/test/benchmarks/format_to_n.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/formatted_size.bench.cpp b/libcxx/test/benchmarks/formatted_size.bench.cpp
index de67dae..2945df2 100644
--- a/libcxx/benchmarks/formatted_size.bench.cpp
+++ b/libcxx/test/benchmarks/formatted_size.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/formatter_float.bench.cpp b/libcxx/test/benchmarks/formatter_float.bench.cpp
index e7a673b..d1da585 100644
--- a/libcxx/benchmarks/formatter_float.bench.cpp
+++ b/libcxx/test/benchmarks/formatter_float.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/formatter_int.bench.cpp b/libcxx/test/benchmarks/formatter_int.bench.cpp
index 7cd794a..6c624e9 100644
--- a/libcxx/benchmarks/formatter_int.bench.cpp
+++ b/libcxx/test/benchmarks/formatter_int.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/function.bench.cpp b/libcxx/test/benchmarks/function.bench.cpp
index dd397bc..dd397bc 100644
--- a/libcxx/benchmarks/function.bench.cpp
+++ b/libcxx/test/benchmarks/function.bench.cpp
diff --git a/libcxx/benchmarks/join_view.bench.cpp b/libcxx/test/benchmarks/join_view.bench.cpp
index c789a39..c789a39 100644
--- a/libcxx/benchmarks/join_view.bench.cpp
+++ b/libcxx/test/benchmarks/join_view.bench.cpp
diff --git a/libcxx/benchmarks/lexicographical_compare_three_way.bench.cpp b/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
index e58134a..03f20d0 100644
--- a/libcxx/benchmarks/lexicographical_compare_three_way.bench.cpp
+++ b/libcxx/test/benchmarks/lexicographical_compare_three_way.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/libcxxabi/dynamic_cast.bench.cpp b/libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp
index 439eea8..439eea8 100644
--- a/libcxx/benchmarks/libcxxabi/dynamic_cast.bench.cpp
+++ b/libcxx/test/benchmarks/libcxxabi/dynamic_cast.bench.cpp
diff --git a/libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp b/libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
index df4daf7..df4daf7 100644
--- a/libcxx/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
+++ b/libcxx/test/benchmarks/libcxxabi/dynamic_cast_old_stress.bench.cpp
diff --git a/libcxx/benchmarks/lit.cfg.py b/libcxx/test/benchmarks/lit.cfg.py.in
index 0d08966..a3ce247 100644
--- a/libcxx/benchmarks/lit.cfg.py
+++ b/libcxx/test/benchmarks/lit.cfg.py.in
@@ -3,7 +3,7 @@
import os
import site
-site.addsitedir(os.path.join(os.path.dirname(os.path.dirname(__file__)), "utils"))
+site.addsitedir(os.path.join("@LIBCXX_SOURCE_DIR@", "utils"))
from libcxx.test.googlebenchmark import GoogleBenchmark
# Tell pylint that we know config and lit_config exist somewhere.
@@ -15,8 +15,8 @@ if "PYLINT_IMPORT" in os.environ:
config.name = "libc++ benchmarks"
config.suffixes = []
-config.test_exec_root = os.path.join(config.libcxx_obj_root, "benchmarks")
-config.test_source_root = config.test_exec_root
+config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.test_source_root = "@CMAKE_CURRENT_BINARY_DIR@"
config.test_format = GoogleBenchmark(
test_sub_dirs=".", test_suffix=".bench.out", benchmark_args=config.benchmark_args
diff --git a/libcxx/benchmarks/lit.site.cfg.py.in b/libcxx/test/benchmarks/lit.site.cfg.py.in
index e3ce8b2..6d4b0ca 100644
--- a/libcxx/benchmarks/lit.site.cfg.py.in
+++ b/libcxx/test/benchmarks/lit.site.cfg.py.in
@@ -7,4 +7,4 @@ config.libcxx_obj_root = "@LIBCXX_BINARY_DIR@"
config.benchmark_args = "@LIBCXX_BENCHMARK_TEST_ARGS@".split(';')
# Let the main config do the real work.
-lit_config.load_config(config, "@LIBCXX_SOURCE_DIR@/benchmarks/lit.cfg.py")
\ No newline at end of file
+lit_config.load_config(config, "@CMAKE_CURRENT_BINARY_DIR@/lit.cfg.py")
\ No newline at end of file
diff --git a/libcxx/benchmarks/map.bench.cpp b/libcxx/test/benchmarks/map.bench.cpp
index 255164b..255164b 100644
--- a/libcxx/benchmarks/map.bench.cpp
+++ b/libcxx/test/benchmarks/map.bench.cpp
diff --git a/libcxx/benchmarks/monotonic_buffer.bench.cpp b/libcxx/test/benchmarks/monotonic_buffer.bench.cpp
index 39bb853..39bb853 100644
--- a/libcxx/benchmarks/monotonic_buffer.bench.cpp
+++ b/libcxx/test/benchmarks/monotonic_buffer.bench.cpp
diff --git a/libcxx/benchmarks/numeric/gcd.bench.cpp b/libcxx/test/benchmarks/numeric/gcd.bench.cpp
index f8b6a85..f8b6a85 100644
--- a/libcxx/benchmarks/numeric/gcd.bench.cpp
+++ b/libcxx/test/benchmarks/numeric/gcd.bench.cpp
diff --git a/libcxx/benchmarks/ordered_set.bench.cpp b/libcxx/test/benchmarks/ordered_set.bench.cpp
index 22540d8..22540d8 100644
--- a/libcxx/benchmarks/ordered_set.bench.cpp
+++ b/libcxx/test/benchmarks/ordered_set.bench.cpp
diff --git a/libcxx/benchmarks/random.bench.cpp b/libcxx/test/benchmarks/random.bench.cpp
index fe2eb66..0645a4e 100644
--- a/libcxx/benchmarks/random.bench.cpp
+++ b/libcxx/test/benchmarks/random.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/shared_mutex_vs_mutex.bench.cpp b/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
index 19d13b7..5482935 100644
--- a/libcxx/benchmarks/shared_mutex_vs_mutex.bench.cpp
+++ b/libcxx/test/benchmarks/shared_mutex_vs_mutex.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp b/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp
index 3769e96..3769e96 100644
--- a/libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
+++ b/libcxx/test/benchmarks/std_format_spec_string_unicode.bench.cpp
diff --git a/libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp b/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
index 3b5a1c4..3b5a1c4 100644
--- a/libcxx/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
+++ b/libcxx/test/benchmarks/std_format_spec_string_unicode_escape.bench.cpp
diff --git a/libcxx/benchmarks/stop_token.bench.cpp b/libcxx/test/benchmarks/stop_token.bench.cpp
index e059a11..6be4736 100644
--- a/libcxx/benchmarks/stop_token.bench.cpp
+++ b/libcxx/test/benchmarks/stop_token.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/string.bench.cpp b/libcxx/test/benchmarks/string.bench.cpp
index 92018b0..49d3722 100644
--- a/libcxx/benchmarks/string.bench.cpp
+++ b/libcxx/test/benchmarks/string.bench.cpp
@@ -1,3 +1,10 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
#include <cstdint>
#include <new>
diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/test/benchmarks/stringstream.bench.cpp
index 3cbe5ac..a333900 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/test/benchmarks/stringstream.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "benchmark/benchmark.h"
#include "test_macros.h"
diff --git a/libcxx/benchmarks/system_error.bench.cpp b/libcxx/test/benchmarks/system_error.bench.cpp
index 4b0568d..4b0568d 100644
--- a/libcxx/benchmarks/system_error.bench.cpp
+++ b/libcxx/test/benchmarks/system_error.bench.cpp
diff --git a/libcxx/benchmarks/to_chars.bench.cpp b/libcxx/test/benchmarks/to_chars.bench.cpp
index 1a3dc64..2e3c59f 100644
--- a/libcxx/benchmarks/to_chars.bench.cpp
+++ b/libcxx/test/benchmarks/to_chars.bench.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/benchmarks/unordered_set_operations.bench.cpp b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp
index d49de57..bcf6add 100644
--- a/libcxx/benchmarks/unordered_set_operations.bench.cpp
+++ b/libcxx/test/benchmarks/unordered_set_operations.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include <cstdint>
#include <cstdlib>
#include <cstring>
diff --git a/libcxx/benchmarks/util_smartptr.bench.cpp b/libcxx/test/benchmarks/util_smartptr.bench.cpp
index 053cbd6..053cbd6 100644
--- a/libcxx/benchmarks/util_smartptr.bench.cpp
+++ b/libcxx/test/benchmarks/util_smartptr.bench.cpp
diff --git a/libcxx/benchmarks/variant_visit_1.bench.cpp b/libcxx/test/benchmarks/variant_visit_1.bench.cpp
index 7d736f8..fa9b246 100644
--- a/libcxx/benchmarks/variant_visit_1.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_1.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "benchmark/benchmark.h"
#include "VariantBenchmarks.h"
diff --git a/libcxx/benchmarks/variant_visit_2.bench.cpp b/libcxx/test/benchmarks/variant_visit_2.bench.cpp
index ed26cd4..84e26f9 100644
--- a/libcxx/benchmarks/variant_visit_2.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_2.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "benchmark/benchmark.h"
#include "VariantBenchmarks.h"
diff --git a/libcxx/benchmarks/variant_visit_3.bench.cpp b/libcxx/test/benchmarks/variant_visit_3.bench.cpp
index b20d503..1b4e903 100644
--- a/libcxx/benchmarks/variant_visit_3.bench.cpp
+++ b/libcxx/test/benchmarks/variant_visit_3.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "benchmark/benchmark.h"
#include "VariantBenchmarks.h"
diff --git a/libcxx/benchmarks/vector_operations.bench.cpp b/libcxx/test/benchmarks/vector_operations.bench.cpp
index da21d18..8698e45 100644
--- a/libcxx/benchmarks/vector_operations.bench.cpp
+++ b/libcxx/test/benchmarks/vector_operations.bench.cpp
@@ -1,3 +1,11 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include <cstdint>
#include <cstdlib>
#include <cstring>
diff --git a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
index 9c801f5..52d8500 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp
index 0d685f9..7a5bfb992 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/output.streams/ostream.syn/includes.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
index ab52489..6cf5c42 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/transcoding.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp
index fd570c2..b89d02b 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_posix.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp
index 7d17662..bcd1d05 100644
--- a/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp
+++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/vprint_unicode_windows.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/time/convert_to_tm.pass.cpp b/libcxx/test/libcxx/time/convert_to_tm.pass.cpp
index 908a38d..6e3fa62 100644
--- a/libcxx/test/libcxx/time/convert_to_tm.pass.cpp
+++ b/libcxx/test/libcxx/time/convert_to_tm.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
index 580c0f4..8c2b71f3 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp
index 643bf4d..4f8c4ef 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/noexcept.extension.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp
index 867b90e..162ecd5 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.unexpected/noexcept.extension.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
index 27da03c..c4aae33 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp
index 3ecbc0a..2d5cee9 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/noexcept.extension.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
index 9c1c1d2..253ef1d 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp
index 5c10032..2249e2e 100644
--- a/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/enable_insertable.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
index eb562e1..88039f7 100644
--- a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
index c0fe5a8..cd06c50 100644
--- a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp b/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp
index dd6b36e..818fc72 100644
--- a/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp b/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp
index d43d4de..7d1cce5 100644
--- a/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp
index ead90e6..e119e42 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/code_point_width_estimation.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h
index 86c9f06..80751ea 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/concepts_precision.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index 5b11916..298bbe6 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
index f69a995..dd1f4b6 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h
index 80e45c5..3b87942 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/test_exception.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp
index 2a2b12f..9da8c26 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last.pass.cpp
@@ -25,6 +25,7 @@
#include <algorithm>
#include <array>
#include <cassert>
+#include <memory>
#include <ranges>
#include <vector>
@@ -61,7 +62,8 @@ template <class It, class Sent = It>
constexpr void test_iterators() {
using ValueT = std::iter_value_t<It>;
auto make_range = [](auto& a) {
- return std::ranges::subrange(It(std::ranges::begin(a)), Sent(It(std::ranges::end(a))));
+ return std::ranges::subrange(
+ It(std::to_address(std::ranges::begin(a))), Sent(It(std::to_address(std::ranges::end(a)))));
};
{ // simple test
{
@@ -91,7 +93,7 @@ constexpr void test_iterators() {
std::array<ValueT, 0> a = {};
auto ret = std::ranges::find_last(make_range(a), 1).begin();
- assert(ret == It(a.begin()));
+ assert(ret == It(a.data()));
}
}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp
index a15f81b..107fcf9 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if.pass.cpp
@@ -92,14 +92,14 @@ constexpr void test_iterator_classes() {
{
std::array<int, 0> a = {};
- auto ret = std::ranges::find_last_if(it(a.data()), sent(it(a.data())), [](auto&&) { return true; }).begin();
- assert(ret == it(a.data()));
+ auto ret = std::ranges::find_last_if(it(a.begin()), sent(it(a.end())), [](auto&&) { return true; }).begin();
+ assert(ret == it(a.end()));
}
{
std::array<int, 0> a = {};
auto ret = std::ranges::find_last_if(make_range<it, sent>(a), [](auto&&) { return true; }).begin();
- assert(ret == it(a.begin()));
+ assert(ret == it(a.end()));
}
}
@@ -183,8 +183,17 @@ struct NonConstComparable {
friend constexpr bool operator==(NonConstComparable&, const NonConstComparable&) { return true; }
};
+// TODO: this should really use `std::const_iterator`
template <class T>
-using add_const_to_ptr_t = std::add_pointer_t<std::add_const_t<std::remove_pointer_t<T>>>;
+struct add_const_to_ptr {
+ using type = T;
+};
+template <class T>
+struct add_const_to_ptr<T*> {
+ using type = const T*;
+};
+template <class T>
+using add_const_to_ptr_t = typename add_const_to_ptr<T>::type;
constexpr bool test() {
test_iterator_classes<std::type_identity_t, std::type_identity_t>();
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp
index bb0e411..6602ac5 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find.last/ranges.find_last_if_not.pass.cpp
@@ -92,14 +92,14 @@ constexpr void test_iterator_classes() {
{
std::array<int, 0> a = {};
- auto ret = std::ranges::find_last_if_not(it(a.data()), sent(it(a.data())), [](auto&&) { return false; }).begin();
- assert(ret == it(a.data()));
+ auto ret = std::ranges::find_last_if_not(it(a.begin()), sent(it(a.end())), [](auto&&) { return false; }).begin();
+ assert(ret == it(a.end()));
}
{
std::array<int, 0> a = {};
auto ret = std::ranges::find_last_if_not(make_range<it, sent>(a), [](auto&&) { return false; }).begin();
- assert(ret == it(a.begin()));
+ assert(ret == it(a.end()));
}
}
@@ -183,8 +183,17 @@ struct NonConstComparable {
friend constexpr bool operator!=(NonConstComparable&, const NonConstComparable&) { return false; }
};
+// TODO: this should really use `std::const_iterator`
template <class T>
-using add_const_to_ptr_t = std::add_pointer_t<std::add_const_t<std::remove_pointer_t<T>>>;
+struct add_const_to_ptr {
+ using type = T;
+};
+template <class T>
+struct add_const_to_ptr<T*> {
+ using type = const T*;
+};
+template <class T>
+using add_const_to_ptr_t = typename add_const_to_ptr<T>::type;
constexpr bool test() {
test_iterator_classes<std::type_identity_t, std::type_identity_t>();
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp
index da7794e..9bd2814 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.sort/sort/sort.pass.cpp
@@ -8,7 +8,7 @@
// This test did pass but is very slow when run using qemu. ~7 minutes on a
// Neoverse N1 (AArch64) server core.
-// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME
+// REQUIRES: long_tests
// <algorithm>
diff --git a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp
new file mode 100644
index 0000000..2dc7f5c
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.cpp
@@ -0,0 +1,165 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: c++03, c++11, c++14
+
+// <atomic>
+//
+// template <class T>
+// class atomic;
+//
+// static constexpr bool is_always_lock_free;
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+
+#include "test_macros.h"
+#include "atomic_helpers.h"
+
+template <typename T>
+void check_always_lock_free(std::atomic<T> const& a) {
+ using InfoT = LockFreeStatusInfo<T>;
+
+ constexpr std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic<T>::is_always_lock_free;
+
+ // If we know the status of T for sure, validate the exact result of the function.
+ if constexpr (InfoT::status_known) {
+ constexpr LockFreeStatus known_status = InfoT::value;
+ if constexpr (known_status == LockFreeStatus::always) {
+ static_assert(is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+ assert(a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+ } else if constexpr (known_status == LockFreeStatus::never) {
+ static_assert(!is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+ assert(!a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+ } else {
+ assert(a.is_lock_free() || !a.is_lock_free()); // This is kinda dumb, but we might as well call the function once.
+ }
+ }
+
+ // In all cases, also sanity-check it based on the implication always-lock-free => lock-free.
+ if (is_always_lock_free) {
+ std::same_as<bool> decltype(auto) is_lock_free = a.is_lock_free();
+ assert(is_lock_free);
+ }
+ ASSERT_NOEXCEPT(a.is_lock_free());
+}
+
+#define CHECK_ALWAYS_LOCK_FREE(T) \
+ do { \
+ typedef T type; \
+ type obj{}; \
+ std::atomic<type> a(obj); \
+ check_always_lock_free(a); \
+ } while (0)
+
+void test() {
+ char c = 'x';
+ check_always_lock_free(std::atomic<char>(c));
+
+ int i = 0;
+ check_always_lock_free(std::atomic<int>(i));
+
+ float f = 0.f;
+ check_always_lock_free(std::atomic<float>(f));
+
+ int* p = &i;
+ check_always_lock_free(std::atomic<int*>(p));
+
+ CHECK_ALWAYS_LOCK_FREE(bool);
+ CHECK_ALWAYS_LOCK_FREE(char);
+ CHECK_ALWAYS_LOCK_FREE(signed char);
+ CHECK_ALWAYS_LOCK_FREE(unsigned char);
+#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
+ CHECK_ALWAYS_LOCK_FREE(char8_t);
+#endif
+ CHECK_ALWAYS_LOCK_FREE(char16_t);
+ CHECK_ALWAYS_LOCK_FREE(char32_t);
+ CHECK_ALWAYS_LOCK_FREE(wchar_t);
+ CHECK_ALWAYS_LOCK_FREE(short);
+ CHECK_ALWAYS_LOCK_FREE(unsigned short);
+ CHECK_ALWAYS_LOCK_FREE(int);
+ CHECK_ALWAYS_LOCK_FREE(unsigned int);
+ CHECK_ALWAYS_LOCK_FREE(long);
+ CHECK_ALWAYS_LOCK_FREE(unsigned long);
+ CHECK_ALWAYS_LOCK_FREE(long long);
+ CHECK_ALWAYS_LOCK_FREE(unsigned long long);
+ CHECK_ALWAYS_LOCK_FREE(std::nullptr_t);
+ CHECK_ALWAYS_LOCK_FREE(void*);
+ CHECK_ALWAYS_LOCK_FREE(float);
+ CHECK_ALWAYS_LOCK_FREE(double);
+ CHECK_ALWAYS_LOCK_FREE(long double);
+#if __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
+ CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(1 * sizeof(int)))));
+ CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(2 * sizeof(int)))));
+ CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(4 * sizeof(int)))));
+ CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(16 * sizeof(int)))));
+ CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(32 * sizeof(int)))));
+ CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(1 * sizeof(float)))));
+ CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(2 * sizeof(float)))));
+ CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(4 * sizeof(float)))));
+ CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(16 * sizeof(float)))));
+ CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(32 * sizeof(float)))));
+ CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(1 * sizeof(double)))));
+ CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(2 * sizeof(double)))));
+ CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(4 * sizeof(double)))));
+ CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(16 * sizeof(double)))));
+ CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(32 * sizeof(double)))));
+#endif // __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
+ CHECK_ALWAYS_LOCK_FREE(struct Empty{});
+ CHECK_ALWAYS_LOCK_FREE(struct OneInt { int i; });
+ CHECK_ALWAYS_LOCK_FREE(struct IntArr2 { int i[2]; });
+ CHECK_ALWAYS_LOCK_FREE(struct FloatArr3 { float i[3]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr2 { long long int i[2]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr4 { long long int i[4]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr8 { long long int i[8]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr16 { long long int i[16]; });
+ CHECK_ALWAYS_LOCK_FREE(struct Padding {
+ char c; /* padding */
+ long long int i;
+ });
+ CHECK_ALWAYS_LOCK_FREE(union IntFloat {
+ int i;
+ float f;
+ });
+ CHECK_ALWAYS_LOCK_FREE(enum class CharEnumClass : char{foo});
+
+ // C macro and static constexpr must be consistent.
+ enum class CharEnumClass : char { foo };
+ static_assert(std::atomic<bool>::is_always_lock_free == (2 == ATOMIC_BOOL_LOCK_FREE), "");
+ static_assert(std::atomic<char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+ static_assert(std::atomic<CharEnumClass>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+ static_assert(std::atomic<signed char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+ static_assert(std::atomic<unsigned char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
+#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
+ static_assert(std::atomic<char8_t>::is_always_lock_free == (2 == ATOMIC_CHAR8_T_LOCK_FREE), "");
+#endif
+ static_assert(std::atomic<char16_t>::is_always_lock_free == (2 == ATOMIC_CHAR16_T_LOCK_FREE), "");
+ static_assert(std::atomic<char32_t>::is_always_lock_free == (2 == ATOMIC_CHAR32_T_LOCK_FREE), "");
+ static_assert(std::atomic<wchar_t>::is_always_lock_free == (2 == ATOMIC_WCHAR_T_LOCK_FREE), "");
+ static_assert(std::atomic<short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
+ static_assert(std::atomic<unsigned short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
+ static_assert(std::atomic<int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
+ static_assert(std::atomic<unsigned int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
+ static_assert(std::atomic<long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
+ static_assert(std::atomic<unsigned long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
+ static_assert(std::atomic<long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
+ static_assert(std::atomic<unsigned long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
+ static_assert(std::atomic<void*>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
+ static_assert(std::atomic<std::nullptr_t>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
+
+#if TEST_STD_VER >= 20
+ static_assert(std::atomic_signed_lock_free::is_always_lock_free, "");
+ static_assert(std::atomic_unsigned_lock_free::is_always_lock_free, "");
+#endif
+}
+
+int main(int, char**) {
+ test();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp b/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp
deleted file mode 100644
index 6d6e647..0000000
--- a/libcxx/test/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: c++03, c++11, c++14
-
-// <atomic>
-
-// static constexpr bool is_always_lock_free;
-
-#include <atomic>
-#include <cassert>
-#include <cstddef>
-
-#include "test_macros.h"
-
-template <typename T>
-void checkAlwaysLockFree() {
- if (std::atomic<T>::is_always_lock_free) {
- assert(std::atomic<T>().is_lock_free());
- }
-}
-
-void run()
-{
-// structs and unions can't be defined in the template invocation.
-// Work around this with a typedef.
-#define CHECK_ALWAYS_LOCK_FREE(T) \
- do { \
- typedef T type; \
- checkAlwaysLockFree<type>(); \
- } while (0)
-
- CHECK_ALWAYS_LOCK_FREE(bool);
- CHECK_ALWAYS_LOCK_FREE(char);
- CHECK_ALWAYS_LOCK_FREE(signed char);
- CHECK_ALWAYS_LOCK_FREE(unsigned char);
-#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
- CHECK_ALWAYS_LOCK_FREE(char8_t);
-#endif
- CHECK_ALWAYS_LOCK_FREE(char16_t);
- CHECK_ALWAYS_LOCK_FREE(char32_t);
- CHECK_ALWAYS_LOCK_FREE(wchar_t);
- CHECK_ALWAYS_LOCK_FREE(short);
- CHECK_ALWAYS_LOCK_FREE(unsigned short);
- CHECK_ALWAYS_LOCK_FREE(int);
- CHECK_ALWAYS_LOCK_FREE(unsigned int);
- CHECK_ALWAYS_LOCK_FREE(long);
- CHECK_ALWAYS_LOCK_FREE(unsigned long);
- CHECK_ALWAYS_LOCK_FREE(long long);
- CHECK_ALWAYS_LOCK_FREE(unsigned long long);
- CHECK_ALWAYS_LOCK_FREE(std::nullptr_t);
- CHECK_ALWAYS_LOCK_FREE(void*);
- CHECK_ALWAYS_LOCK_FREE(float);
- CHECK_ALWAYS_LOCK_FREE(double);
- CHECK_ALWAYS_LOCK_FREE(long double);
-#if __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
- CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(1 * sizeof(int)))));
- CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(2 * sizeof(int)))));
- CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(4 * sizeof(int)))));
- CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(16 * sizeof(int)))));
- CHECK_ALWAYS_LOCK_FREE(int __attribute__((vector_size(32 * sizeof(int)))));
- CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(1 * sizeof(float)))));
- CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(2 * sizeof(float)))));
- CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(4 * sizeof(float)))));
- CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(16 * sizeof(float)))));
- CHECK_ALWAYS_LOCK_FREE(float __attribute__((vector_size(32 * sizeof(float)))));
- CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(1 * sizeof(double)))));
- CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(2 * sizeof(double)))));
- CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(4 * sizeof(double)))));
- CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(16 * sizeof(double)))));
- CHECK_ALWAYS_LOCK_FREE(double __attribute__((vector_size(32 * sizeof(double)))));
-#endif // __has_attribute(vector_size) && defined(_LIBCPP_VERSION)
- CHECK_ALWAYS_LOCK_FREE(struct Empty {});
- CHECK_ALWAYS_LOCK_FREE(struct OneInt { int i; });
- CHECK_ALWAYS_LOCK_FREE(struct IntArr2 { int i[2]; });
- CHECK_ALWAYS_LOCK_FREE(struct FloatArr3 { float i[3]; });
- CHECK_ALWAYS_LOCK_FREE(struct LLIArr2 { long long int i[2]; });
- CHECK_ALWAYS_LOCK_FREE(struct LLIArr4 { long long int i[4]; });
- CHECK_ALWAYS_LOCK_FREE(struct LLIArr8 { long long int i[8]; });
- CHECK_ALWAYS_LOCK_FREE(struct LLIArr16 { long long int i[16]; });
- CHECK_ALWAYS_LOCK_FREE(struct Padding { char c; /* padding */ long long int i; });
- CHECK_ALWAYS_LOCK_FREE(union IntFloat { int i; float f; });
- CHECK_ALWAYS_LOCK_FREE(enum class CharEnumClass : char { foo });
-
- // C macro and static constexpr must be consistent.
- enum class CharEnumClass : char { foo };
- static_assert(std::atomic<bool>::is_always_lock_free == (2 == ATOMIC_BOOL_LOCK_FREE), "");
- static_assert(std::atomic<char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
- static_assert(std::atomic<CharEnumClass>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
- static_assert(std::atomic<signed char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
- static_assert(std::atomic<unsigned char>::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), "");
-#if TEST_STD_VER > 17 && defined(__cpp_char8_t)
- static_assert(std::atomic<char8_t>::is_always_lock_free == (2 == ATOMIC_CHAR8_T_LOCK_FREE), "");
-#endif
- static_assert(std::atomic<char16_t>::is_always_lock_free == (2 == ATOMIC_CHAR16_T_LOCK_FREE), "");
- static_assert(std::atomic<char32_t>::is_always_lock_free == (2 == ATOMIC_CHAR32_T_LOCK_FREE), "");
- static_assert(std::atomic<wchar_t>::is_always_lock_free == (2 == ATOMIC_WCHAR_T_LOCK_FREE), "");
- static_assert(std::atomic<short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
- static_assert(std::atomic<unsigned short>::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), "");
- static_assert(std::atomic<int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
- static_assert(std::atomic<unsigned int>::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), "");
- static_assert(std::atomic<long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
- static_assert(std::atomic<unsigned long>::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), "");
- static_assert(std::atomic<long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
- static_assert(std::atomic<unsigned long long>::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), "");
- static_assert(std::atomic<void*>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
- static_assert(std::atomic<std::nullptr_t>::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), "");
-
-#if TEST_STD_VER >= 20
- static_assert(std::atomic_signed_lock_free::is_always_lock_free, "");
- static_assert(std::atomic_unsigned_lock_free::is_always_lock_free, "");
-#endif
-}
-
-int main(int, char**) { run(); return 0; }
diff --git a/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
index 94f65e3b4..acdbf63 100644
--- a/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
@@ -9,7 +9,10 @@
// UNSUPPORTED: c++03, c++11, c++14, c++17
// <atomic>
-
+//
+// template <class T>
+// class atomic_ref;
+//
// static constexpr bool is_always_lock_free;
// bool is_lock_free() const noexcept;
@@ -18,10 +21,29 @@
#include <concepts>
#include "test_macros.h"
+#include "atomic_helpers.h"
template <typename T>
-void check_always_lock_free(std::atomic_ref<T> const a) {
- std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic_ref<T>::is_always_lock_free;
+void check_always_lock_free(std::atomic_ref<T> const& a) {
+ using InfoT = LockFreeStatusInfo<T>;
+
+ constexpr std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic_ref<T>::is_always_lock_free;
+
+ // If we know the status of T for sure, validate the exact result of the function.
+ if constexpr (InfoT::status_known) {
+ constexpr LockFreeStatus known_status = InfoT::value;
+ if constexpr (known_status == LockFreeStatus::always) {
+ static_assert(is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+ assert(a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+ } else if constexpr (known_status == LockFreeStatus::never) {
+ static_assert(!is_always_lock_free, "is_always_lock_free is inconsistent with known lock-free status");
+ assert(!a.is_lock_free() && "is_lock_free() is inconsistent with known lock-free status");
+ } else {
+ assert(a.is_lock_free() || !a.is_lock_free()); // This is kinda dumb, but we might as well call the function once.
+ }
+ }
+
+ // In all cases, also sanity-check it based on the implication always-lock-free => lock-free.
if (is_always_lock_free) {
std::same_as<bool> decltype(auto) is_lock_free = a.is_lock_free();
assert(is_lock_free);
@@ -33,10 +55,14 @@ void check_always_lock_free(std::atomic_ref<T> const a) {
do { \
typedef T type; \
type obj{}; \
- check_always_lock_free(std::atomic_ref<type>(obj)); \
+ std::atomic_ref<type> a(obj); \
+ check_always_lock_free(a); \
} while (0)
void test() {
+ char c = 'x';
+ check_always_lock_free(std::atomic_ref<char>(c));
+
int i = 0;
check_always_lock_free(std::atomic_ref<int>(i));
diff --git a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
index 86e0cba..4fc453f 100644
--- a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
@@ -12,6 +12,7 @@
#include <atomic>
#include <cassert>
#include <concepts>
+#include <cstddef>
template <typename T>
constexpr void check_required_alignment() {
diff --git a/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp
index fead6e2..817a70d 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/general.compile.pass.cpp
@@ -94,9 +94,11 @@ void test() {
TEST_IGNORE_NODISCARD a.compare_exchange_weak(v, v);
TEST_IGNORE_NODISCARD a.compare_exchange_strong(v, v, m);
+#if TEST_STD_VER >= 20
a.wait(v);
a.notify_one();
a.notify_all();
+#endif
}
void test() {
diff --git a/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp
index 961aed3..c62127f 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/pointer.compile.pass.cpp
@@ -128,9 +128,11 @@ void test() {
a += 0;
a -= 0;
+#if TEST_STD_VER >= 20
a.wait(v);
a.notify_one();
a.notify_all();
+#endif
}
void test() {
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
index 0ec530c..fc159b15e 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
@@ -7,12 +7,9 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: !has-1024-bit-atomics
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
// XFAIL: availability-synchronization_library-missing
// <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
index c21b67d..330d8a4 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
@@ -7,12 +7,9 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: !has-1024-bit-atomics
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
// XFAIL: availability-synchronization_library-missing
// <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
index af99113..7c5169b 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
@@ -7,12 +7,9 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: !has-1024-bit-atomics
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
// XFAIL: availability-synchronization_library-missing
// <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
index bb8c645..c84eecf 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
@@ -7,12 +7,9 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: !has-1024-bit-atomics
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
// XFAIL: availability-synchronization_library-missing
// <atomic>
diff --git a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp
index 2bac7cb..29881891 100644
--- a/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp
+++ b/libcxx/test/std/atomics/stdatomic.h.syn/types.compile.pass.cpp
@@ -101,6 +101,8 @@
// using std::atomic_fetch_sub_explicit // see below
// using std::atomic_fetch_or // see below
// using std::atomic_fetch_or_explicit // see below
+// using std::atomic_fetch_xor // see below
+// using std::atomic_fetch_xor_explicit // see below
// using std::atomic_fetch_and // see below
// using std::atomic_fetch_and_explicit // see below
// using std::atomic_flag_test_and_set // see below
@@ -222,6 +224,8 @@ void f() {
using ::atomic_fetch_or_explicit;
using ::atomic_fetch_sub;
using ::atomic_fetch_sub_explicit;
+ using ::atomic_fetch_xor;
+ using ::atomic_fetch_xor_explicit;
using ::atomic_flag_clear;
using ::atomic_flag_clear_explicit;
using ::atomic_flag_test_and_set;
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp
index df315fe..6d2a5ec 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h
index f4a1307..54ab278 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp
index f683da6..c6f1815 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp
index 03a116c..001223c 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
index c47fb18..0e6f620 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp
index 07239d3..f5329b5 100644
--- a/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/container.adaptors.format/types.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp
index 6f1aea1..850d616 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h
index 8be3d9a..49d007a 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp
index be46078..efbb700 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp
index 0823374..cc88bf6 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
index abae40d..a1b7f98 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp
index d1cbcbf..71794f8 100644
--- a/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector.bool/vector.bool.fmt/types.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp
index 57023b7..98fa56b 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/locale-specific_form.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp
index 6abfb30..711152b 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h
index f5a6a63..ca540bd 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/print_tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp
index 19a0263..c79a1a3 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/println.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp
index 054ea36..73ad191 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_nonunicode.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
index b4d6b35..21f6654 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream.formatted/ostream.formatted.print/vprint_unicode.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp
index 34eff10..9b9b0e4 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp
index 70e0c11..d3e4463 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp
index 3edc0e2..0bd5ca5 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/print.file.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp
index d348a3b..ea2ec65 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/print.sh.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h b/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h
index d28256c..74a2cea 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/print_tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp
index a262c287..a797abb 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/println.blank_line.sh.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp
index 2f088e7a..fc40680 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/println.file.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp
index b811b4f..b884738c 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/println.sh.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp
index edc8bb3..66fe733 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.file.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp
index c1a690f..ee4478f 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_nonunicode.sh.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp
index bd9b991..fe49876 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.file.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp
index e32a794..992e7f4 100644
--- a/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.format/print.fun/vprint_unicode.sh.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp
index 21efa97..946c263 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp
@@ -12,10 +12,7 @@
// iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const;
-// With the Microsoft UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0"
-// while other C runtimes produce just "0x0p+0".
-// https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844
-// XFAIL: msvc
+// XFAIL: win32-broken-printf-a-precision
// XFAIL: LIBCXX-AIX-FIXME
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp
index c97c9a0..a195c34 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp
@@ -12,10 +12,7 @@
// iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const;
-// With the Microsoft UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0"
-// while other C runtimes produce just "0x0p+0".
-// https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844
-// XFAIL: msvc
+// XFAIL: win32-broken-printf-a-precision
// XFAIL: LIBCXX-AIX-FIXME
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
index 549f3ce..a523bb9 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.bin/eval.PR44847.pass.cpp
@@ -18,8 +18,8 @@
// Serializing/deserializing the state of the RNG requires iostreams
// UNSUPPORTED: no-localization
-// This test appears to hang with picolibc & qemu.
-// UNSUPPORTED: LIBCXX-PICOLIBC-FIXME
+// Very slow when run in qemu.
+// REQUIRES: long_tests
#include <random>
#include <numeric>
diff --git a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp
index d9d9c1d..b1ad644 100644
--- a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp
index aff7b26..b0d94a8 100644
--- a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp
index 8c45ba9..2d747e3 100644
--- a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp
index 633a0c8..892e29b 100644
--- a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp
index fe7068d..d67cf36 100644
--- a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// <barrier>
diff --git a/libcxx/test/std/thread/thread.barrier/max.pass.cpp b/libcxx/test/std/thread/thread.barrier/max.pass.cpp
index b09a02e..a3ec904 100644
--- a/libcxx/test/std/thread/thread.barrier/max.pass.cpp
+++ b/libcxx/test/std/thread/thread.barrier/max.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// <barrier>
diff --git a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp
index 8ca4f37..23cb270 100644
--- a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp
index eb524ab..f33f7b2 100644
--- a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp
index bca4561..df258b0 100644
--- a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// <latch>
diff --git a/libcxx/test/std/thread/thread.latch/max.pass.cpp b/libcxx/test/std/thread/thread.latch/max.pass.cpp
index bcf353e..4490f94a 100644
--- a/libcxx/test/std/thread/thread.latch/max.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/max.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// <latch>
diff --git a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp
index 8f35446..fa09e56 100644
--- a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp
+++ b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
index 337ad4c..8c7ca42 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
// <mutex>
@@ -18,12 +17,13 @@
#include <mutex>
#include "test_macros.h"
+#include "types.h"
int main(int, char**) {
- std::mutex mutex;
+ MyMutex mutex;
{
std::unique_lock lock(mutex);
- ASSERT_SAME_TYPE(decltype(lock), std::unique_lock<std::mutex>);
+ ASSERT_SAME_TYPE(decltype(lock), std::unique_lock<MyMutex>);
}
return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp
index 799cb61..9ab8369 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_assign.compile.pass.cpp
@@ -13,22 +13,7 @@
// unique_lock& operator=(unique_lock const&) = delete;
#include <mutex>
-#include <cassert>
-int main(int, char**)
-{
- {
- typedef std::mutex M;
- M m0;
- M m1;
- std::unique_lock<M> lk0(m0);
- std::unique_lock<M> lk1(m1);
- lk1 = lk0;
- assert(lk1.mutex() == &m0);
- assert(lk1.owns_lock() == true);
- assert(lk0.mutex() == nullptr);
- assert(lk0.owns_lock() == false);
- }
+#include "../types.h"
- return 0;
-}
+static_assert(!std::is_copy_assignable<std::unique_lock<MyMutex> >::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp
index e258198..e846061 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/copy_ctor.compile.pass.cpp
@@ -13,20 +13,7 @@
// unique_lock(unique_lock const&) = delete;
#include <mutex>
-#include <cassert>
-int main(int, char**)
-{
- {
- typedef std::mutex M;
- M m;
- std::unique_lock<M> lk0(m);
- std::unique_lock<M> lk = lk0;
- assert(lk.mutex() == &m);
- assert(lk.owns_lock() == true);
- assert(lk0.mutex() == nullptr);
- assert(lk0.owns_lock() == false);
- }
+#include "../types.h"
- return 0;
-}
+static_assert(!std::is_copy_constructible<std::unique_lock<MyMutex> >::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
index 2034a26..6fc4f7f 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/default.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -14,16 +12,16 @@
// unique_lock();
-#include <mutex>
#include <cassert>
+#include <mutex>
#include "test_macros.h"
+#include "../types.h"
-int main(int, char**)
-{
- std::unique_lock<std::mutex> ul;
- assert(!ul.owns_lock());
- assert(ul.mutex() == nullptr);
+int main(int, char**) {
+ std::unique_lock<MyMutex> ul;
+ assert(!ul.owns_lock());
+ assert(ul.mutex() == nullptr);
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
index 0af918c..9563fde 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_assign.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads, c++03
// <mutex>
@@ -14,16 +12,16 @@
// unique_lock& operator=(unique_lock&& u);
-#include <mutex>
#include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
+#include "nasty_containers.h"
+#include "../types.h"
#include "test_macros.h"
-int main(int, char**)
-{
- {
- typedef std::mutex M;
+int main(int, char**) {
+ {
+ typedef MyMutex M;
M m0;
M m1;
std::unique_lock<M> lk0(m0);
@@ -33,8 +31,8 @@ int main(int, char**)
assert(lk1.owns_lock() == true);
assert(lk0.mutex() == nullptr);
assert(lk0.owns_lock() == false);
- }
- {
+ }
+ {
typedef nasty_mutex M;
M m0;
M m1;
@@ -45,7 +43,7 @@ int main(int, char**)
assert(lk1.owns_lock() == true);
assert(lk0.mutex() == nullptr);
assert(lk0.owns_lock() == false);
- }
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
index cce0eb5f..08f6fc8 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/move_ctor.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads, c++03
+// UNSUPPORTED: c++03
// <mutex>
@@ -14,16 +14,16 @@
// unique_lock(unique_lock&& u);
-#include <mutex>
#include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
+#include "nasty_containers.h"
+#include "../types.h"
#include "test_macros.h"
-int main(int, char**)
-{
- {
- typedef std::mutex M;
+int main(int, char**) {
+ {
+ typedef MyMutex M;
M m;
std::unique_lock<M> lk0(m);
std::unique_lock<M> lk = std::move(lk0);
@@ -31,8 +31,8 @@ int main(int, char**)
assert(lk.owns_lock() == true);
assert(lk0.mutex() == nullptr);
assert(lk0.owns_lock() == false);
- }
- {
+ }
+ {
typedef nasty_mutex M;
M m;
std::unique_lock<M> lk0(m);
@@ -41,7 +41,7 @@ int main(int, char**)
assert(lk.owns_lock() == true);
assert(lk0.mutex() == nullptr);
assert(lk0.owns_lock() == false);
- }
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
index 4adbe26..28cc438 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_adopt_lock.pass.cpp
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03
// <mutex>
@@ -15,30 +14,30 @@
// unique_lock(mutex_type& m, adopt_lock_t);
-#include <mutex>
#include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
+#include "nasty_containers.h"
+#include "../types.h"
#include "test_macros.h"
-int main(int, char**)
-{
- {
- typedef std::mutex M;
+int main(int, char**) {
+ {
+ typedef MyMutex M;
M m;
m.lock();
std::unique_lock<M> lk(m, std::adopt_lock);
assert(lk.mutex() == std::addressof(m));
assert(lk.owns_lock() == true);
- }
- {
+ }
+ {
typedef nasty_mutex M;
M m;
m.lock();
std::unique_lock<M> lk(m, std::adopt_lock);
assert(lk.mutex() == std::addressof(m));
assert(lk.owns_lock() == true);
- }
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
index 06ef204..96a9afb 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.cons/mutex_defer_lock.pass.cpp
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03
// <mutex>
@@ -15,28 +14,28 @@
// unique_lock(mutex_type& m, defer_lock_t);
-#include <mutex>
#include <cassert>
-#include "nasty_containers.h"
+#include <mutex>
+#include "nasty_containers.h"
+#include "../types.h"
#include "test_macros.h"
-int main(int, char**)
-{
- {
- typedef std::mutex M;
+int main(int, char**) {
+ {
+ typedef MyMutex M;
M m;
std::unique_lock<M> lk(m, std::defer_lock);
assert(lk.mutex() == std::addressof(m));
assert(lk.owns_lock() == false);
- }
- {
+ }
+ {
typedef nasty_mutex M;
M m;
std::unique_lock<M> lk(m, std::defer_lock);
assert(lk.mutex() == std::addressof(m));
assert(lk.owns_lock() == false);
- }
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
index 920baa7..6767e11 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/lock.pass.cpp
@@ -51,7 +51,7 @@ void f()
}
catch (std::system_error& e)
{
- assert(e.code().value() == EDEADLK);
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
}
#endif
lk.unlock();
@@ -64,7 +64,7 @@ void f()
}
catch (std::system_error& e)
{
- assert(e.code().value() == EPERM);
+ assert(e.code() == std::errc::operation_not_permitted);
}
#endif
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
index 4cf5ec2..2ee5d37 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock.pass.cpp
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03
// ALLOW_RETRIES: 2
@@ -21,53 +20,35 @@
#include <system_error>
#include "test_macros.h"
+#include "../types.h"
-bool try_lock_called = false;
+MyTimedMutex m;
-struct mutex
-{
- bool try_lock()
- {
- try_lock_called = !try_lock_called;
- return try_lock_called;
- }
- void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
- std::unique_lock<mutex> lk(m, std::defer_lock);
- assert(lk.try_lock() == true);
- assert(try_lock_called == true);
- assert(lk.owns_lock() == true);
+int main(int, char**) {
+ std::unique_lock<MyTimedMutex> lk(m, std::defer_lock);
+ assert(lk.try_lock() == true);
+ assert(m.try_lock_called == true);
+ assert(lk.owns_lock() == true);
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EDEADLK);
- }
+ try {
+ TEST_IGNORE_NODISCARD lk.try_lock();
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
+ }
#endif
- lk.unlock();
- assert(lk.try_lock() == false);
- assert(try_lock_called == false);
- assert(lk.owns_lock() == false);
- lk.release();
+ lk.unlock();
+ assert(lk.try_lock() == false);
+ assert(m.try_lock_called == false);
+ assert(lk.owns_lock() == false);
+ lk.release();
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
- }
+ try {
+ TEST_IGNORE_NODISCARD lk.try_lock();
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
+ }
#endif
return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
index 8e7004e..603cc7b 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_for.pass.cpp
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03
// <mutex>
@@ -21,57 +20,36 @@
#include <system_error>
#include "test_macros.h"
+#include "../types.h"
-bool try_lock_for_called = false;
+MyTimedMutex m;
-typedef std::chrono::milliseconds ms;
-
-struct mutex
-{
- template <class Rep, class Period>
- bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time)
- {
- assert(rel_time == ms(5));
- try_lock_for_called = !try_lock_for_called;
- return try_lock_for_called;
- }
- void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
- std::unique_lock<mutex> lk(m, std::defer_lock);
- assert(lk.try_lock_for(ms(5)) == true);
- assert(try_lock_for_called == true);
- assert(lk.owns_lock() == true);
+int main(int, char**) {
+ using ms = std::chrono::milliseconds;
+ std::unique_lock<MyTimedMutex> lk(m, std::defer_lock);
+ assert(lk.try_lock_for(ms(5)) == true);
+ assert(m.try_lock_for_called == true);
+ assert(lk.owns_lock() == true);
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EDEADLK);
- }
+ try {
+ TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
+ }
#endif
- lk.unlock();
- assert(lk.try_lock_for(ms(5)) == false);
- assert(try_lock_for_called == false);
- assert(lk.owns_lock() == false);
- lk.release();
+ lk.unlock();
+ assert(lk.try_lock_for(ms(5)) == false);
+ assert(m.try_lock_for_called == false);
+ assert(lk.owns_lock() == false);
+ lk.release();
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
- }
+ try {
+ TEST_IGNORE_NODISCARD lk.try_lock_for(ms(5));
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
+ }
#endif
return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
index 077bc51..46ab951 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/try_lock_until.pass.cpp
@@ -6,7 +6,6 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03
// <mutex>
@@ -17,61 +16,41 @@
// bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
#include <cassert>
+#include <chrono>
#include <mutex>
#include <system_error>
#include "test_macros.h"
+#include "../types.h"
-bool try_lock_until_called = false;
+MyTimedMutex m;
-struct mutex
-{
- template <class Clock, class Duration>
- bool try_lock_until(const std::chrono::time_point<Clock, Duration>& abs_time)
- {
- typedef std::chrono::milliseconds ms;
- assert(Clock::now() - abs_time < ms(5));
- try_lock_until_called = !try_lock_until_called;
- return try_lock_until_called;
- }
- void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
- typedef std::chrono::steady_clock Clock;
- std::unique_lock<mutex> lk(m, std::defer_lock);
- assert(lk.try_lock_until(Clock::now()) == true);
- assert(try_lock_until_called == true);
- assert(lk.owns_lock() == true);
+int main(int, char**) {
+ typedef std::chrono::system_clock Clock;
+ std::unique_lock<MyTimedMutex> lk(m, std::defer_lock);
+ assert(lk.try_lock_until(Clock::now()) == true);
+ assert(m.try_lock_until_called == true);
+ assert(lk.owns_lock() == true);
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EDEADLK);
- }
+ try {
+ TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
+ }
#endif
- lk.unlock();
- assert(lk.try_lock_until(Clock::now()) == false);
- assert(try_lock_until_called == false);
- assert(lk.owns_lock() == false);
- lk.release();
+ lk.unlock();
+ assert(lk.try_lock_until(Clock::now()) == false);
+ assert(m.try_lock_until_called == false);
+ assert(lk.owns_lock() == false);
+ lk.release();
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
- }
+ try {
+ TEST_IGNORE_NODISCARD lk.try_lock_until(Clock::now());
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
+ }
#endif
return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
index 30c7951..97808f6 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.locking/unlock.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -19,45 +17,30 @@
#include <system_error>
#include "test_macros.h"
+#include "../types.h"
-bool unlock_called = false;
-
-struct mutex
-{
- void lock() {}
- void unlock() {unlock_called = true;}
-};
-
-mutex m;
+MyMutex m;
-int main(int, char**)
-{
- std::unique_lock<mutex> lk(m);
- lk.unlock();
- assert(unlock_called == true);
- assert(lk.owns_lock() == false);
+int main(int, char**) {
+ std::unique_lock<MyMutex> lk(m);
+ lk.unlock();
+ assert(lk.owns_lock() == false);
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- lk.unlock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
- }
+ try {
+ lk.unlock();
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
+ }
#endif
- lk.release();
+ lk.release();
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- lk.unlock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
- }
+ try {
+ lk.unlock();
+ assert(false);
+ } catch (std::system_error& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
+ }
#endif
return 0;
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
index fc12d3b..361c85e 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/member_swap.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -14,28 +12,22 @@
// void swap(unique_lock& u);
-#include <mutex>
#include <cassert>
+#include <mutex>
#include "test_macros.h"
-
-struct mutex
-{
- void lock() {}
- void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
- std::unique_lock<mutex> lk1(m);
- std::unique_lock<mutex> lk2;
- lk1.swap(lk2);
- assert(lk1.mutex() == nullptr);
- assert(lk1.owns_lock() == false);
- assert(lk2.mutex() == &m);
- assert(lk2.owns_lock() == true);
+#include "../types.h"
+
+MyMutex m;
+
+int main(int, char**) {
+ std::unique_lock<MyMutex> lk1(m);
+ std::unique_lock<MyMutex> lk2;
+ lk1.swap(lk2);
+ assert(lk1.mutex() == nullptr);
+ assert(lk1.owns_lock() == false);
+ assert(lk2.mutex() == &m);
+ assert(lk2.owns_lock() == true);
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
index 03d268c..5133032 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/nonmember_swap.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -15,28 +13,22 @@
// template <class Mutex>
// void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y);
-#include <mutex>
#include <cassert>
+#include <mutex>
#include "test_macros.h"
-
-struct mutex
-{
- void lock() {}
- void unlock() {}
-};
-
-mutex m;
-
-int main(int, char**)
-{
- std::unique_lock<mutex> lk1(m);
- std::unique_lock<mutex> lk2;
- swap(lk1, lk2);
- assert(lk1.mutex() == nullptr);
- assert(lk1.owns_lock() == false);
- assert(lk2.mutex() == &m);
- assert(lk2.owns_lock() == true);
+#include "../types.h"
+
+MyMutex m;
+
+int main(int, char**) {
+ std::unique_lock<MyMutex> lk1(m);
+ std::unique_lock<MyMutex> lk2;
+ swap(lk1, lk2);
+ assert(lk1.mutex() == nullptr);
+ assert(lk1.owns_lock() == false);
+ assert(lk2.mutex() == &m);
+ assert(lk2.owns_lock() == true);
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
index 4f2d59c..a726c8cc 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.mod/release.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -14,36 +12,28 @@
// mutex_type* release() noexcept;
-#include <mutex>
#include <cassert>
+#include <mutex>
#include "test_macros.h"
-
-struct mutex
-{
- static int lock_count;
- static int unlock_count;
- void lock() {++lock_count;}
- void unlock() {++unlock_count;}
-};
-
-int mutex::lock_count = 0;
-int mutex::unlock_count = 0;
-
-mutex m;
-
-int main(int, char**)
-{
- std::unique_lock<mutex> lk(m);
- assert(lk.mutex() == &m);
- assert(lk.owns_lock() == true);
- assert(mutex::lock_count == 1);
- assert(mutex::unlock_count == 0);
- assert(lk.release() == &m);
- assert(lk.mutex() == nullptr);
- assert(lk.owns_lock() == false);
- assert(mutex::lock_count == 1);
- assert(mutex::unlock_count == 0);
+#include "../types.h"
+
+int MyCountingMutex::lock_count = 0;
+int MyCountingMutex::unlock_count = 0;
+
+MyCountingMutex m;
+
+int main(int, char**) {
+ std::unique_lock<MyCountingMutex> lk(m);
+ assert(lk.mutex() == &m);
+ assert(lk.owns_lock() == true);
+ assert(MyCountingMutex::lock_count == 1);
+ assert(MyCountingMutex::unlock_count == 0);
+ assert(lk.release() == &m);
+ assert(lk.mutex() == nullptr);
+ assert(lk.owns_lock() == false);
+ assert(MyCountingMutex::lock_count == 1);
+ assert(MyCountingMutex::unlock_count == 0);
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
index dd2f523..72346e8 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/mutex.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -14,21 +12,21 @@
// mutex_type *mutex() const;
-#include <mutex>
#include <cassert>
+#include <mutex>
#include "test_macros.h"
+#include "../types.h"
-std::mutex m;
+MyMutex m;
-int main(int, char**)
-{
- std::unique_lock<std::mutex> lk0;
- assert(lk0.mutex() == nullptr);
- std::unique_lock<std::mutex> lk1(m);
- assert(lk1.mutex() == &m);
- lk1.unlock();
- assert(lk1.mutex() == &m);
+int main(int, char**) {
+ std::unique_lock<MyMutex> lk0;
+ assert(lk0.mutex() == nullptr);
+ std::unique_lock<MyMutex> lk1(m);
+ assert(lk1.mutex() == &m);
+ lk1.unlock();
+ assert(lk1.mutex() == &m);
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
index ea05eb7..3759302 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/op_bool.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -14,26 +12,26 @@
// explicit operator bool() const noexcept;
-#include <mutex>
#include <cassert>
+#include <mutex>
#include <type_traits>
#include "test_macros.h"
+#include "../types.h"
-std::mutex m;
+MyMutex m;
-int main(int, char**)
-{
- static_assert(std::is_constructible<bool, std::unique_lock<std::mutex> >::value, "");
- static_assert(!std::is_convertible<std::unique_lock<std::mutex>, bool>::value, "");
+int main(int, char**) {
+ static_assert(std::is_constructible<bool, std::unique_lock<MyMutex> >::value, "");
+ static_assert(!std::is_convertible<std::unique_lock<MyMutex>, bool>::value, "");
- std::unique_lock<std::mutex> lk0;
- assert(static_cast<bool>(lk0) == false);
- std::unique_lock<std::mutex> lk1(m);
- assert(static_cast<bool>(lk1) == true);
- lk1.unlock();
- assert(static_cast<bool>(lk1) == false);
- ASSERT_NOEXCEPT(static_cast<bool>(lk0));
+ std::unique_lock<MyMutex> lk0;
+ assert(static_cast<bool>(lk0) == false);
+ std::unique_lock<MyMutex> lk1(m);
+ assert(static_cast<bool>(lk1) == true);
+ lk1.unlock();
+ assert(static_cast<bool>(lk1) == false);
+ ASSERT_NOEXCEPT(static_cast<bool>(lk0));
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
index 9b192fb..1639427 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/thread.lock.unique.obs/owns_lock.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -14,21 +12,21 @@
// bool owns_lock() const;
-#include <mutex>
#include <cassert>
+#include <mutex>
#include "test_macros.h"
+#include "../types.h"
-std::mutex m;
+MyMutex m;
-int main(int, char**)
-{
- std::unique_lock<std::mutex> lk0;
- assert(lk0.owns_lock() == false);
- std::unique_lock<std::mutex> lk1(m);
- assert(lk1.owns_lock() == true);
- lk1.unlock();
- assert(lk1.owns_lock() == false);
+int main(int, char**) {
+ std::unique_lock<MyMutex> lk0;
+ assert(lk0.owns_lock() == false);
+ std::unique_lock<MyMutex> lk1(m);
+ assert(lk1.owns_lock() == true);
+ lk1.unlock();
+ assert(lk1.owns_lock() == false);
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp
index d849788..312863a 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.compile.pass.cpp
@@ -5,8 +5,6 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
// <mutex>
@@ -22,11 +20,6 @@
#include <type_traits>
#include "test_macros.h"
+#include "types.h"
-int main(int, char**)
-{
- static_assert((std::is_same<std::unique_lock<std::mutex>::mutex_type,
- std::mutex>::value), "");
-
- return 0;
-}
+static_assert((std::is_same<std::unique_lock<MyMutex>::mutex_type, MyMutex>::value), "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h
new file mode 100644
index 0000000..15a1a53
--- /dev/null
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/types.h
@@ -0,0 +1,88 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H
+#define TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H
+
+#include <cassert>
+#include <chrono>
+
+struct MyMutex {
+ bool locked = false;
+
+ MyMutex() = default;
+ ~MyMutex() { assert(!locked); }
+
+ void lock() {
+ assert(!locked);
+ locked = true;
+ }
+
+ void unlock() {
+ assert(locked);
+ locked = false;
+ }
+
+ bool try_lock() {
+ if (locked)
+ return false;
+ lock();
+ return true;
+ }
+
+ template <class Rep, class Period>
+ bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time) {
+ using ms = std::chrono::milliseconds;
+ assert(rel_time == ms(5));
+ if (locked)
+ return false;
+ lock();
+ return true;
+ }
+
+ MyMutex(MyMutex const&) = delete;
+ MyMutex& operator=(MyMutex const&) = delete;
+};
+
+struct MyTimedMutex {
+ using ms = std::chrono::milliseconds;
+
+ bool try_lock_called = false;
+ bool try_lock_for_called = false;
+ bool try_lock_until_called = false;
+
+ bool try_lock() {
+ try_lock_called = !try_lock_called;
+ return try_lock_called;
+ }
+
+ template <class Rep, class Period>
+ bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time) {
+ assert(rel_time == ms(5));
+ try_lock_for_called = !try_lock_for_called;
+ return try_lock_for_called;
+ }
+
+ template <class Clock, class Duration>
+ bool try_lock_until(const std::chrono::time_point<Clock, Duration>& abs_time) {
+ assert(Clock::now() - abs_time < ms(5));
+ try_lock_until_called = !try_lock_until_called;
+ return try_lock_until_called;
+ }
+
+ void unlock() {}
+};
+
+struct MyCountingMutex {
+ static int lock_count;
+ static int unlock_count;
+ void lock() { ++lock_count; }
+ void unlock() { ++unlock_count; }
+};
+
+#endif // TEST_STD_THREAD_THREAD_MUTEX_THREAD_LOCK_THREAD_LOCK_GUARD_TYPES_H
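
(Aside, not part of the patch: the archetypes above act as instrumented Lockable stand-ins so the unique_lock tests no longer depend on std::mutex. A minimal standalone sketch, duplicating the MyMutex shape locally, of how std::unique_lock drives it:)

#include <cassert>
#include <mutex>

// Same shape as the MyMutex archetype added above.
struct MyMutex {
  bool locked = false;
  void lock() { assert(!locked); locked = true; }
  void unlock() { assert(locked); locked = false; }
};

int main() {
  MyMutex m;
  std::unique_lock<MyMutex> lk(m, std::defer_lock); // defer_lock: does not lock
  assert(!lk.owns_lock());
  lk.lock(); // forwards to MyMutex::lock
  assert(m.locked && lk.owns_lock());
  lk.unlock(); // forwards to MyMutex::unlock
  assert(!m.locked);
  return 0;
}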
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.verify.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.compile.pass.cpp
index 34164aa..0d90bff 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.verify.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/assign.compile.pass.cpp
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
@@ -16,12 +16,6 @@
// shared_mutex& operator=(const shared_mutex&) = delete;
#include <shared_mutex>
+#include <type_traits>
-int main(int, char**)
-{
- std::shared_mutex m0;
- std::shared_mutex m1;
- m1 = m0; // expected-error {{overload resolution selected deleted operator '='}}
-
- return 0;
-}
+static_assert(!std::is_copy_assignable<std::shared_mutex>::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/copy.verify.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.copy.compile.pass.cpp
index 9b43198..f9e1935 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/copy.verify.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.copy.compile.pass.cpp
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
@@ -16,11 +16,6 @@
// shared_mutex(const shared_mutex&) = delete;
#include <shared_mutex>
+#include <type_traits>
-int main(int, char**)
-{
- std::shared_mutex m0;
- std::shared_mutex m1(m0); // expected-error {{call to deleted constructor of 'std::shared_mutex'}}
-
- return 0;
-}
+static_assert(!std::is_copy_constructible<std::shared_mutex>::value, "");
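
(Aside, not part of the patch: the renames from *.verify.cpp / *.compile.fail.cpp to *.compile.pass.cpp rely on deleted copy operations being observable through type traits, so no expected-error marker is needed. A standalone sketch of the pattern, compiled with C++17 for std::shared_mutex:)

#include <mutex>
#include <shared_mutex>
#include <type_traits>

// Deleted copy operations show up through the standard traits, so the check
// only needs to compile; the real *.compile.pass.cpp tests omit main() since
// they are compiled but not run.
static_assert(!std::is_copy_constructible<std::shared_mutex>::value, "");
static_assert(!std::is_copy_assignable<std::shared_mutex>::value, "");
static_assert(!std::is_copy_constructible<std::mutex>::value, "");
static_assert(!std::is_copy_assignable<std::mutex>::value, "");

int main() { return 0; }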
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.default.pass.cpp
index 5504645..c941f3a 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/ctor.default.pass.cpp
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
@@ -19,10 +19,9 @@
#include "test_macros.h"
-int main(int, char**)
-{
- std::shared_mutex m;
- (void)m;
+int main(int, char**) {
+ std::shared_mutex m;
+ (void)m;
- return 0;
+ return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp
index 122f2b0..724fb07 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock.pass.cpp
@@ -5,63 +5,105 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_mutex;
// void lock();
+#include <shared_mutex>
+#include <atomic>
#include <cassert>
#include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
#include <thread>
+#include <vector>
#include "make_test_thread.h"
#include "test_macros.h"
-std::shared_mutex m;
+int main(int, char**) {
+ // Exclusive-lock a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_mutex m;
+ m.lock();
+ m.unlock();
+ }
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+ // Exclusive-lock a mutex that is already locked exclusively. This should block until it is unlocked.
+ {
+ std::atomic<bool> ready(false);
+ std::shared_mutex m;
+ m.lock();
+ std::atomic<bool> is_locked_from_main(true);
-ms WaitTime = ms(250);
+ std::thread t = support::make_test_thread([&] {
+ ready = true;
+ m.lock();
+ assert(!is_locked_from_main);
+ m.unlock();
+ });
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+ while (!ready)
+ /* spin */;
-void f()
-{
- time_point t0 = Clock::now();
- m.lock();
- time_point t1 = Clock::now();
+ // We would rather signal this after we unlock, but that would create a race condition.
+ // We instead signal it before we unlock, which means that it's technically possible for the thread
+ // to take the lock while we're still holding it and for the test to still pass.
+ is_locked_from_main = false;
m.unlock();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
-int main(int, char**)
-{
- m.lock();
- std::thread t = support::make_test_thread(f);
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
t.join();
+ }
+
+ // Exclusive-lock a mutex that is already share-locked. This should block until it is unlocked.
+ {
+ std::atomic<bool> ready(false);
+ std::shared_mutex m;
+ m.lock_shared();
+ std::atomic<bool> is_locked_from_main(true);
+
+ std::thread t = support::make_test_thread([&] {
+ ready = true;
+ m.lock();
+ assert(!is_locked_from_main);
+ m.unlock();
+ });
+
+ while (!ready)
+ /* spin */;
+
+ // We would rather signal this after we unlock, but that would create a race condition.
+ // We instead signal it before we unlock, which means that it's technically possible for
+ // the thread to take the lock while we're still holding it and for the test to still pass.
+ is_locked_from_main = false;
+ m.unlock_shared();
+
+ t.join();
+ }
+
+ // Make sure that at most one thread can acquire the mutex concurrently.
+ {
+ std::atomic<int> counter = 0;
+ std::shared_mutex mutex;
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 10; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ mutex.lock();
+ counter++;
+ assert(counter == 1);
+ counter--;
+ mutex.unlock();
+ }));
+ }
+
+ for (auto& t : threads)
+ t.join();
+ }
return 0;
}
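
(Aside, not part of the patch: the blocking cases above all use the same ready/is_locked_from_main handshake instead of sleeps and tolerances. A condensed standalone sketch of that handshake, using std::thread directly rather than support::make_test_thread:)

#include <atomic>
#include <cassert>
#include <shared_mutex>
#include <thread>

int main() {
  std::shared_mutex m;
  std::atomic<bool> ready(false);
  std::atomic<bool> is_locked_from_main(true);

  m.lock(); // hold the mutex so the helper thread must block

  std::thread t([&] {
    ready = true;
    m.lock(); // blocks until main() releases the mutex
    assert(!is_locked_from_main); // main() cleared the flag just before unlocking
    m.unlock();
  });

  while (!ready)
    /* spin until the helper is about to contend on the lock */;

  // Clear the flag before unlocking: signalling after unlock() would race with
  // the helper's assert, so a (rare) spurious pass is accepted instead.
  is_locked_from_main = false;
  m.unlock();

  t.join();
  return 0;
}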
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp
index 9df0d57..e6640f7 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/lock_shared.pass.cpp
@@ -5,87 +5,139 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_mutex;
// void lock_shared();
-#include <cassert>
-#include <chrono>
-#include <cstdlib>
#include <shared_mutex>
+#include <algorithm>
+#include <atomic>
+#include <cassert>
#include <thread>
#include <vector>
#include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms WaitTime = ms(250);
-
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
-
-void f()
-{
- time_point t0 = Clock::now();
- m.lock_shared();
- time_point t1 = Clock::now();
- m.unlock_shared();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
-void g()
-{
- time_point t0 = Clock::now();
- m.lock_shared();
- time_point t1 = Clock::now();
- m.unlock_shared();
- ns d = t1 - t0;
- assert(d < Tolerance); // within tolerance
-}
+int main(int, char**) {
+ // Lock-shared a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_mutex m;
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ m.lock_shared();
+ m.unlock_shared();
+ }));
+ }
+ for (auto& t : threads)
+ t.join();
+ }
-int main(int, char**)
-{
+ // Lock-shared a mutex that is already exclusively locked. This should block until it is unlocked.
+ {
+ std::atomic<int> ready(0);
+ std::shared_mutex m;
m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f));
- std::this_thread::sleep_for(WaitTime);
+ std::atomic<bool> is_locked_from_main(true);
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ ++ready;
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
+ m.lock_shared();
+ assert(!is_locked_from_main);
+ m.unlock_shared();
+ }));
+ }
+
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
+ // We would rather signal this after we unlock, but that would create a race condition.
+ // We instead signal it before we unlock, which means that it's technically possible for
+ // the thread to take the lock while we're still holding it and for the test to still pass.
+ is_locked_from_main = false;
m.unlock();
- for (auto& t : v)
- t.join();
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Lock-shared a mutex that is already lock-shared. This should succeed.
+ {
+ std::atomic<int> ready(0);
+ std::shared_mutex m;
m.lock_shared();
- for (auto& t : v)
- t = support::make_test_thread(g);
- std::thread q = support::make_test_thread(f);
- std::this_thread::sleep_for(WaitTime);
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ ++ready;
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
+ m.lock_shared();
+ m.unlock_shared();
+ }));
+ }
+
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
m.unlock_shared();
- for (auto& t : v)
- t.join();
- q.join();
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Create several threads that all acquire-shared the same mutex and make sure that each
+ // thread successfully acquires-shared the mutex.
+ //
+ // We record how many other threads were holding the mutex when it was acquired, which allows
+ // us to know whether the test was somewhat effective at causing multiple threads to lock at
+ // the same time.
+ {
+ std::shared_mutex mutex;
+ std::vector<std::thread> threads;
+ constexpr int n_threads = 5;
+ std::atomic<int> holders = 0;
+ int concurrent_holders[n_threads] = {};
+ std::atomic<bool> ready = false;
+
+ for (int i = 0; i != n_threads; ++i) {
+ threads.push_back(support::make_test_thread([&, i] {
+ while (!ready) {
+ // spin
+ }
+
+ mutex.lock_shared();
+ ++holders;
+ concurrent_holders[i] = holders;
+
+ mutex.unlock_shared();
+ --holders;
+ }));
+ }
+
+ ready = true; // let the threads actually start shared-acquiring the mutex
+ for (auto& t : threads)
+ t.join();
+
+ // We can't guarantee that we'll ever have more than 1 concurrent holder so that's what
+ // we assert, however in principle we should often trigger more than 1 concurrent holder.
+ int max_concurrent_holders = *std::max_element(std::begin(concurrent_holders), std::end(concurrent_holders));
+ assert(max_concurrent_holders >= 1);
+ }
return 0;
}
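
(Aside, not part of the patch: the final block above measures sharing rather than asserting exact concurrency, since overlap cannot be forced. A standalone sketch of the holder-counting idea with a smaller, arbitrary thread count and plain std::thread:)

#include <algorithm>
#include <atomic>
#include <cassert>
#include <shared_mutex>
#include <thread>
#include <vector>

int main() {
  constexpr int n_threads = 4;
  std::shared_mutex mutex;
  std::atomic<int> holders(0);
  int concurrent_holders[n_threads] = {};
  std::atomic<bool> ready(false);

  std::vector<std::thread> threads;
  for (int i = 0; i != n_threads; ++i) {
    threads.emplace_back([&, i] {
      while (!ready)
        /* spin */;
      mutex.lock_shared();
      ++holders;
      concurrent_holders[i] = holders; // how many readers held it with us
      mutex.unlock_shared();
      --holders;
    });
  }

  ready = true; // release all readers at once to encourage overlap
  for (auto& t : threads)
    t.join();

  // More than one concurrent holder is likely but cannot be guaranteed,
  // which is why only >= 1 is asserted.
  assert(*std::max_element(std::begin(concurrent_holders), std::end(concurrent_holders)) >= 1);
  return 0;
}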
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp
index f39b1ee..11d396d 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock.pass.cpp
@@ -5,56 +5,60 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_mutex;
// bool try_lock();
+#include <shared_mutex>
+#include <atomic>
#include <cassert>
#include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
#include <thread>
#include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-void f()
-{
- time_point t0 = Clock::now();
- assert(!m.try_lock());
- assert(!m.try_lock());
- assert(!m.try_lock());
- while(!m.try_lock())
- ;
- time_point t1 = Clock::now();
+
+int main(int, char**) {
+ // Try to exclusive-lock a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_mutex m;
+ bool succeeded = m.try_lock();
+ assert(succeeded);
m.unlock();
- ns d = t1 - t0 - ms(250);
- assert(d < ms(200)); // within 200ms
-}
+ }
-int main(int, char**)
-{
+ // Try to exclusive-lock a mutex that is already locked exclusively. This should fail.
+ {
+ std::shared_mutex m;
m.lock();
- std::thread t = support::make_test_thread(f);
- std::this_thread::sleep_for(ms(250));
+
+ std::thread t = support::make_test_thread([&] {
+ bool succeeded = m.try_lock();
+ assert(!succeeded);
+ });
+ t.join();
+
m.unlock();
+ }
+
+ // Try to exclusive-lock a mutex that is already share-locked. This should fail.
+ {
+ std::shared_mutex m;
+ m.lock_shared();
+
+ std::thread t = support::make_test_thread([&] {
+ bool succeeded = m.try_lock();
+ assert(!succeeded);
+ });
t.join();
+ m.unlock_shared();
+ }
+
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp
index c091b06..61069be 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.shared_mutex.requirements/thread.shared_mutex.class/try_lock_shared.pass.cpp
@@ -5,71 +5,76 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11, c++14
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_mutex;
// bool try_lock_shared();
-#include <cassert>
-#include <chrono>
-#include <cstdlib>
#include <shared_mutex>
+#include <cassert>
#include <thread>
#include <vector>
#include "make_test_thread.h"
-#include "test_macros.h"
-std::shared_mutex m;
+int main(int, char**) {
+ // Try to lock-shared a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_mutex m;
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared();
+ assert(succeeded);
+ m.unlock_shared();
+ }));
+ }
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+ for (auto& t : threads)
+ t.join();
+ }
+ // Try to lock-shared a mutex that is already exclusively locked. This should fail.
+ {
+ std::shared_mutex m;
+ m.lock();
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(200);
-#else
-ms Tolerance = ms(200 * 5);
-#endif
-
-void f()
-{
- time_point t0 = Clock::now();
- assert(!m.try_lock_shared());
- assert(!m.try_lock_shared());
- assert(!m.try_lock_shared());
- while(!m.try_lock_shared())
- std::this_thread::yield();
- time_point t1 = Clock::now();
- m.unlock_shared();
- ns d = t1 - t0 - ms(250);
- assert(d < Tolerance); // within tolerance
-}
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared();
+ assert(!succeeded);
+ }));
+ }
+ for (auto& t : threads)
+ t.join();
-int main(int, char**)
-{
- m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f));
- std::this_thread::sleep_for(ms(250));
m.unlock();
- for (auto& t : v)
- t.join();
+ }
+
+ // Try to lock-shared a mutex that is already lock-shared. This should succeed.
+ {
+ std::shared_mutex m;
+ m.lock_shared();
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared();
+ assert(succeeded);
+ m.unlock_shared();
+ }));
+ }
+ m.unlock_shared();
+
+ for (auto& t : threads)
+ t.join();
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.pass.cpp
index c2cd893..9cbb0b1 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/assign.compile.pass.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
// <shared_mutex>
@@ -15,12 +16,6 @@
// shared_timed_mutex& operator=(const shared_timed_mutex&) = delete;
#include <shared_mutex>
+#include <type_traits>
-int main(int, char**)
-{
- std::shared_timed_mutex m0;
- std::shared_timed_mutex m1;
- m1 = m0;
-
- return 0;
-}
+static_assert(!std::is_copy_assignable<std::shared_timed_mutex>::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/copy.compile.fail.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.copy.compile.pass.cpp
index 9b0a661..12b01a5 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/copy.compile.fail.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.copy.compile.pass.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
// <shared_mutex>
@@ -15,11 +16,6 @@
// shared_timed_mutex(const shared_timed_mutex&) = delete;
#include <shared_mutex>
+#include <type_traits>
-int main(int, char**)
-{
- std::shared_timed_mutex m0;
- std::shared_timed_mutex m1(m0);
-
- return 0;
-}
+static_assert(!std::is_copy_constructible<std::shared_timed_mutex>::value, "");
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.default.pass.cpp
index 7a8d096..eadc59e 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/default.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/ctor.default.pass.cpp
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp
index acabbab..f78b13d 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock.pass.cpp
@@ -5,62 +5,104 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 3
-
// <shared_mutex>
// class shared_timed_mutex;
// void lock();
+#include <shared_mutex>
#include <atomic>
#include <cassert>
#include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
#include <thread>
+#include <vector>
#include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-std::atomic<bool> ready(false);
-time_point start;
-
-ms WaitTime = ms(250);
-
-void f()
-{
- ready.store(true);
- m.lock();
- time_point t0 = start;
- time_point t1 = Clock::now();
- m.unlock();
- assert(t0.time_since_epoch() > ms(0));
- assert(t1 - t0 >= WaitTime);
-}
-int main(int, char**)
-{
- m.lock();
- std::thread t = support::make_test_thread(f);
- while (!ready)
- std::this_thread::yield();
- start = Clock::now();
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- t.join();
+int main(int, char**) {
+ // Exclusive-lock a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_timed_mutex m;
+ m.lock();
+ m.unlock();
+ }
+
+ // Exclusive-lock a mutex that is already locked exclusively. This should block until it is unlocked.
+ {
+ std::atomic<bool> ready(false);
+ std::shared_timed_mutex m;
+ m.lock();
+ std::atomic<bool> is_locked_from_main(true);
+
+ std::thread t = support::make_test_thread([&] {
+ ready = true;
+ m.lock();
+ assert(!is_locked_from_main);
+ m.unlock();
+ });
+
+ while (!ready)
+ /* spin */;
+
+ // We would rather signal this after we unlock, but that would create a race condition.
+ // We instead signal it before we unlock, which means that it's technically possible for the thread
+ // to take the lock while we're still holding it and for the test to still pass.
+ is_locked_from_main = false;
+ m.unlock();
+
+ t.join();
+ }
+
+ // Exclusive-lock a mutex that is already share-locked. This should block until it is unlocked.
+ {
+ std::atomic<bool> ready(false);
+ std::shared_timed_mutex m;
+ m.lock_shared();
+ std::atomic<bool> is_locked_from_main(true);
+
+ std::thread t = support::make_test_thread([&] {
+ ready = true;
+ m.lock();
+ assert(!is_locked_from_main);
+ m.unlock();
+ });
+
+ while (!ready)
+ /* spin */;
+
+ // We would rather signal this after we unlock, but that would create a race condition.
+ // We instead signal it before we unlock, which means that it's technically possible for
+ // the thread to take the lock while we're still holding it and for the test to still pass.
+ is_locked_from_main = false;
+ m.unlock_shared();
+
+ t.join();
+ }
+
+ // Make sure that at most one thread can acquire the mutex concurrently.
+ {
+ std::atomic<int> counter(0);
+ std::shared_timed_mutex mutex;
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 10; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ mutex.lock();
+ counter++;
+ assert(counter == 1);
+ counter--;
+ mutex.unlock();
+ }));
+ }
+
+ for (auto& t : threads)
+ t.join();
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp
index 36f5dba..d9a3db1 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/lock_shared.pass.cpp
@@ -5,100 +5,138 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 3
-
// <shared_mutex>
// class shared_timed_mutex;
// void lock_shared();
-
+#include <shared_mutex>
+#include <algorithm>
#include <atomic>
#include <cassert>
-#include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
#include <thread>
#include <vector>
#include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-std::atomic<unsigned> countDown;
-time_point readerStart; // Protected by the above mutex 'm'
-time_point writerStart; // Protected by the above mutex 'm'
-
-ms WaitTime = ms(250);
-
-void readerMustWait() {
- --countDown;
- m.lock_shared();
- time_point t1 = Clock::now();
- time_point t0 = readerStart;
- m.unlock_shared();
- assert(t0.time_since_epoch() > ms(0));
- assert(t1 - t0 >= WaitTime);
-}
-
-void reader() {
- --countDown;
- m.lock_shared();
- m.unlock_shared();
-}
-
-void writerMustWait() {
- --countDown;
- m.lock();
- time_point t1 = Clock::now();
- time_point t0 = writerStart;
- m.unlock();
- assert(t0.time_since_epoch() > ms(0));
- assert(t1 - t0 >= WaitTime);
-}
-int main(int, char**)
-{
- int threads = 5;
-
- countDown.store(threads);
- m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < threads; ++i)
- v.push_back(support::make_test_thread(readerMustWait));
- while (countDown > 0)
- std::this_thread::yield();
- readerStart = Clock::now();
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- for (auto& t : v)
- t.join();
-
- countDown.store(threads + 1);
- m.lock_shared();
- for (auto& t : v)
- t = support::make_test_thread(reader);
- std::thread q = support::make_test_thread(writerMustWait);
- while (countDown > 0)
- std::this_thread::yield();
- writerStart = Clock::now();
- std::this_thread::sleep_for(WaitTime);
- m.unlock_shared();
- for (auto& t : v)
- t.join();
- q.join();
+int main(int, char**) {
+ // Lock-shared a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_timed_mutex m;
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ m.lock_shared();
+ m.unlock_shared();
+ }));
+ }
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Lock-shared a mutex that is already exclusively locked. This should block until it is unlocked.
+ {
+ std::atomic<int> ready(0);
+ std::shared_timed_mutex m;
+ m.lock();
+ std::atomic<bool> is_locked_from_main(true);
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ ++ready;
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
+ m.lock_shared();
+ assert(!is_locked_from_main);
+ m.unlock_shared();
+ }));
+ }
+
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
+ // We would rather signal this after we unlock, but that would create a race condition.
+ // We instead signal it before we unlock, which means that it's technically possible for
+ // the thread to take the lock while we're still holding it and for the test to still pass.
+ is_locked_from_main = false;
+ m.unlock();
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Lock-shared a mutex that is already lock-shared. This should succeed.
+ {
+ std::atomic<int> ready(0);
+ std::shared_timed_mutex m;
+ m.lock_shared();
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ ++ready;
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
+ m.lock_shared();
+ m.unlock_shared();
+ }));
+ }
+
+ while (ready < 5)
+ /* wait until all threads have been created */;
+
+ m.unlock_shared();
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Create several threads that all acquire-shared the same mutex and make sure that each
+ // thread successfully acquires-shared the mutex.
+ //
+ // We record how many other threads were holding the mutex when it was acquired, which allows
+ // us to know whether the test was somewhat effective at causing multiple threads to lock at
+ // the same time.
+ {
+ std::shared_timed_mutex mutex;
+ std::vector<std::thread> threads;
+ constexpr int n_threads = 5;
+ std::atomic<int> holders(0);
+ int concurrent_holders[n_threads] = {};
+ std::atomic<bool> ready(false);
+
+ for (int i = 0; i != n_threads; ++i) {
+ threads.push_back(support::make_test_thread([&, i] {
+ while (!ready)
+ /* spin */;
+
+ mutex.lock_shared();
+ ++holders;
+ concurrent_holders[i] = holders;
+
+ mutex.unlock_shared();
+ --holders;
+ }));
+ }
+
+ ready = true; // let the threads actually start shared-acquiring the mutex
+ for (auto& t : threads)
+ t.join();
+
+ // We can't guarantee that we'll ever have more than 1 concurrent holder so that's what
+ // we assert, however in principle we should often trigger more than 1 concurrent holder.
+ int max_concurrent_holders = *std::max_element(std::begin(concurrent_holders), std::end(concurrent_holders));
+ assert(max_concurrent_holders >= 1);
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp
index cc7091f..9ed8b5b 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock.pass.cpp
@@ -5,56 +5,60 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_timed_mutex;
// bool try_lock();
+#include <shared_mutex>
+#include <atomic>
#include <cassert>
#include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
#include <thread>
#include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-void f()
-{
- time_point t0 = Clock::now();
- assert(!m.try_lock());
- assert(!m.try_lock());
- assert(!m.try_lock());
- while(!m.try_lock())
- ;
- time_point t1 = Clock::now();
+
+int main(int, char**) {
+ // Try to exclusive-lock a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_timed_mutex m;
+ bool succeeded = m.try_lock();
+ assert(succeeded);
m.unlock();
- ns d = t1 - t0 - ms(250);
- assert(d < ms(200)); // within 200ms
-}
+ }
-int main(int, char**)
-{
+ // Try to exclusive-lock a mutex that is already locked exclusively. This should fail.
+ {
+ std::shared_timed_mutex m;
m.lock();
- std::thread t = support::make_test_thread(f);
- std::this_thread::sleep_for(ms(250));
+
+ std::thread t = support::make_test_thread([&] {
+ bool succeeded = m.try_lock();
+ assert(!succeeded);
+ });
+ t.join();
+
m.unlock();
+ }
+
+ // Try to exclusive-lock a mutex that is already share-locked. This should fail.
+ {
+ std::shared_timed_mutex m;
+ m.lock_shared();
+
+ std::thread t = support::make_test_thread([&] {
+ bool succeeded = m.try_lock();
+ assert(!succeeded);
+ });
t.join();
+ m.unlock_shared();
+ }
+
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp
index 30fc3c5..0ae9a48 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_for.pass.cpp
@@ -5,10 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
// <shared_mutex>
@@ -18,69 +17,89 @@
// bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
#include <shared_mutex>
-#include <thread>
-#include <cstdlib>
+#include <atomic>
#include <cassert>
#include <chrono>
+#include <thread>
#include "make_test_thread.h"
-#include "test_macros.h"
-std::shared_timed_mutex m;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+}
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+int main(int, char**) {
+ // Try to lock a mutex that is not locked yet. This should succeed immediately.
+ {
+ std::shared_timed_mutex m;
+ bool succeeded = m.try_lock_for(std::chrono::milliseconds(1));
+ assert(succeeded);
+ m.unlock();
+ }
+ // Try to lock an already-locked mutex for a long enough amount of time and succeed.
+ // This is technically flaky, but we use such long durations that it should pass even
+ // in slow or contended environments.
+ {
+ std::chrono::milliseconds const wait_time(500);
+ std::chrono::milliseconds const tolerance = wait_time * 3;
+ std::atomic<bool> ready(false);
-ms WaitTime = ms(250);
+ std::shared_timed_mutex m;
+ m.lock();
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+ std::thread t = support::make_test_thread([&] {
+ auto elapsed = measure([&] {
+ ready = true;
+ bool succeeded = m.try_lock_for(wait_time);
+ assert(succeeded);
+ m.unlock();
+ });
-void f1()
-{
- time_point t0 = Clock::now();
- assert(m.try_lock_for(WaitTime + Tolerance) == true);
- time_point t1 = Clock::now();
- m.unlock();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
+ // Ensure we didn't wait significantly longer than our timeout. This is technically
+ // flaky and non-conforming because an implementation is free to block for arbitrarily
+ // long, but any decent quality implementation should pass this test.
+ assert(elapsed - wait_time < tolerance);
+ });
-void f2()
-{
- time_point t0 = Clock::now();
- assert(m.try_lock_for(WaitTime) == false);
- time_point t1 = Clock::now();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
+ // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise
+ // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+ // There is still technically a race condition here.
+ while (!ready)
+ /* spin */;
+ std::this_thread::sleep_for(wait_time / 5);
-int main(int, char**)
-{
- {
- m.lock();
- std::thread t = support::make_test_thread(f1);
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- t.join();
- }
- {
- m.lock();
- std::thread t = support::make_test_thread(f2);
- std::this_thread::sleep_for(WaitTime + Tolerance);
- m.unlock();
- t.join();
- }
+ m.unlock(); // this should allow the thread to lock 'm'
+ t.join();
+ }
+
+ // Try to lock an already-locked mutex for a short amount of time and fail.
+ // Again, this is technically flaky but we use such long durations that it should work.
+ {
+ std::chrono::milliseconds const wait_time(10);
+ std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+ std::shared_timed_mutex m;
+ m.lock();
+
+ std::thread t = support::make_test_thread([&] {
+ auto elapsed = measure([&] {
+ bool succeeded = m.try_lock_for(wait_time);
+ assert(!succeeded);
+ });
+
+ // Ensure we failed within some bounded time.
+ assert(elapsed - wait_time < tolerance);
+ });
+
+ t.join();
+
+ m.unlock();
+ }
return 0;
}
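
The measure() helper introduced in this test wraps an arbitrary callable and reports how long it ran; the result is then used to bound how long try_lock_for() blocked. A self-contained sketch of the same idea (using steady_clock here instead of the patch's high_resolution_clock, and a sleep as the timed operation):

#include <cassert>
#include <chrono>
#include <thread>

// Time an arbitrary callable, mirroring the measure() helper above.
template <class Function>
std::chrono::microseconds measure(Function f) {
  auto start = std::chrono::steady_clock::now();
  f();
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}

int main() {
  auto const wait_time = std::chrono::milliseconds(50);
  auto elapsed = measure([&] { std::this_thread::sleep_for(wait_time); });
  // sleep_for may oversleep but never undersleeps, so only the lower bound is reliable.
  assert(elapsed >= wait_time);
  return 0;
}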
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp
index 8523df0..6430a32 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared.pass.cpp
@@ -5,70 +5,76 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_timed_mutex;
// bool try_lock_shared();
-#include <cassert>
-#include <chrono>
-#include <cstdlib>
#include <shared_mutex>
+#include <cassert>
#include <thread>
#include <vector>
#include "make_test_thread.h"
-#include "test_macros.h"
-std::shared_timed_mutex m;
+int main(int, char**) {
+ // Try to lock-shared a mutex that is not locked yet. This should succeed.
+ {
+ std::shared_timed_mutex m;
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared();
+ assert(succeeded);
+ m.unlock_shared();
+ }));
+ }
+
+ for (auto& t : threads)
+ t.join();
+ }
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+ // Try to lock-shared a mutex that is already exclusively locked. This should fail.
+ {
+ std::shared_timed_mutex m;
+ m.lock();
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared();
+ assert(!succeeded);
+ }));
+ }
+ for (auto& t : threads)
+ t.join();
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(200);
-#else
-ms Tolerance = ms(200 * 5);
-#endif
+ m.unlock();
+ }
-void f()
-{
- time_point t0 = Clock::now();
- assert(!m.try_lock_shared());
- assert(!m.try_lock_shared());
- assert(!m.try_lock_shared());
- while(!m.try_lock_shared())
- std::this_thread::yield();
- time_point t1 = Clock::now();
+ // Try to lock-shared a mutex that is already lock-shared. This should succeed.
+ {
+ std::shared_timed_mutex m;
+ m.lock_shared();
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared();
+ assert(succeeded);
+ m.unlock_shared();
+ }));
+ }
m.unlock_shared();
- ns d = t1 - t0 - ms(250);
- assert(d < Tolerance); // within tolerance
-}
-int main(int, char**)
-{
- m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f));
- std::this_thread::sleep_for(ms(250));
- m.unlock();
- for (auto& t : v)
- t.join();
+ for (auto& t : threads)
+ t.join();
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp
index c7d02a3..23a88ba 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_for.pass.cpp
@@ -5,12 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 3
-
// <shared_mutex>
// class shared_timed_mutex;
@@ -19,75 +17,110 @@
// bool try_lock_shared_for(const chrono::duration<Rep, Period>& rel_time);
#include <shared_mutex>
-#include <thread>
-#include <vector>
-#include <cstdlib>
+#include <atomic>
#include <cassert>
#include <chrono>
+#include <thread>
+#include <vector>
#include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms WaitTime = ms(250);
-
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
-
-void f1()
-{
- time_point t0 = Clock::now();
- assert(m.try_lock_shared_for(WaitTime + Tolerance) == true);
- time_point t1 = Clock::now();
- m.unlock_shared();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within 50ms
-}
-void f2()
-{
- time_point t0 = Clock::now();
- assert(m.try_lock_shared_for(WaitTime) == false);
- time_point t1 = Clock::now();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within 50ms
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f1));
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- for (auto& t : v)
- t.join();
+int main(int, char**) {
+ // Try to lock-shared a mutex that is not locked yet. This should succeed immediately.
+ {
+ std::shared_timed_mutex m;
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared_for(std::chrono::milliseconds(1));
+ assert(succeeded);
+ m.unlock_shared();
+ }));
+ }
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Try to lock-shared an already-locked mutex for a long enough amount of time and succeed.
+ // This is technically flaky, but we use such long durations that it should pass even
+ // in slow or contended environments.
+ {
+ std::chrono::milliseconds const wait_time(500);
+ std::chrono::milliseconds const tolerance = wait_time * 3;
+ std::atomic<int> ready(0);
+
+ std::shared_timed_mutex m;
+ m.lock();
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ ++ready;
+ while (ready < 5)
+ /* spin until all threads are created */;
+
+ auto elapsed = measure([&] {
+ bool succeeded = m.try_lock_shared_for(wait_time);
+ assert(succeeded);
+ m.unlock_shared();
+ });
+
+ // Ensure we didn't wait significantly longer than our timeout. This is technically
+ // flaky and non-conforming because an implementation is free to block for arbitrarily
+ // long, but any decent quality implementation should pass this test.
+ assert(elapsed - wait_time < tolerance);
+ }));
}
- {
- m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f2));
- std::this_thread::sleep_for(WaitTime + Tolerance);
- m.unlock();
- for (auto& t : v)
- t.join();
+
+ // Wait for all the threads to be ready to take the lock before we unlock it from here, otherwise
+ // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+ // There is still technically a race condition here.
+ while (ready < 5)
+ /* spin */;
+ std::this_thread::sleep_for(wait_time / 5);
+
+ m.unlock(); // this should allow the threads to lock-shared 'm'
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Try to lock-shared an already-locked mutex for a short amount of time and fail.
+ // Again, this is technically flaky but we use such long durations that it should work.
+ {
+ std::chrono::milliseconds const wait_time(10);
+ std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+ std::shared_timed_mutex m;
+ m.lock();
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ auto elapsed = measure([&] {
+ bool succeeded = m.try_lock_shared_for(wait_time);
+ assert(!succeeded);
+ });
+
+ // Ensure we failed within some bounded time.
+ assert(elapsed - wait_time < tolerance);
+ }));
}
+ for (auto& t : threads)
+ t.join();
+
+ m.unlock();
+ }
+
return 0;
}
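
These timed-lock tests coordinate through an atomic counter: each worker announces that it is about to block, and the main thread releases the exclusive lock only after every worker has signalled, so the try_lock_shared_for() calls really contend with an owned mutex. A hedged sketch of that coordination pattern with plain std::thread (the 5-thread count and the 100 ms grace sleep are illustrative choices, not taken from the patch):

#include <atomic>
#include <cassert>
#include <chrono>
#include <shared_mutex>
#include <thread>
#include <vector>

int main() {
  std::shared_timed_mutex m;
  std::atomic<int> ready(0);
  m.lock();

  std::vector<std::thread> threads;
  for (int i = 0; i != 5; ++i) {
    threads.push_back(std::thread([&] {
      ++ready; // announce that this worker is about to block on the mutex
      bool ok = m.try_lock_shared_for(std::chrono::seconds(5));
      assert(ok);
      m.unlock_shared();
    }));
  }

  // Release the exclusive lock only after every worker has signalled readiness;
  // a worker that has not reached try_lock_shared_for() yet simply succeeds
  // without blocking, so the assertion still holds.
  while (ready < 5)
    std::this_thread::yield();
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  m.unlock();

  for (auto& t : threads)
    t.join();
  return 0;
}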
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp
index a95ffab..af88bae 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_shared_until.pass.cpp
@@ -5,12 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_timed_mutex;
@@ -18,73 +16,109 @@
// template <class Clock, class Duration>
// bool try_lock_shared_until(const chrono::time_point<Clock, Duration>& abs_time);
-#include <thread>
-
+#include <shared_mutex>
#include <atomic>
#include <cassert>
#include <chrono>
-#include <cstdlib>
-#include <shared_mutex>
+#include <thread>
#include <vector>
#include "make_test_thread.h"
-#include "test_macros.h"
-
-std::shared_timed_mutex m;
-
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms SuccessWaitTime = ms(5000); // Some machines are busy or slow or both
-ms FailureWaitTime = ms(50);
-
-// On busy or slow machines, there can be a significant delay between thread
-// creation and thread start, so we use an atomic variable to signal that the
-// thread is actually executing.
-static std::atomic<unsigned> countDown;
-
-void f1()
-{
- --countDown;
- time_point t0 = Clock::now();
- assert(m.try_lock_shared_until(Clock::now() + SuccessWaitTime) == true);
- time_point t1 = Clock::now();
- m.unlock_shared();
- assert(t1 - t0 <= SuccessWaitTime);
-}
-void f2()
-{
- time_point t0 = Clock::now();
- assert(m.try_lock_shared_until(Clock::now() + FailureWaitTime) == false);
- assert(Clock::now() - t0 >= FailureWaitTime);
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- int threads = 5;
+int main(int, char**) {
+ // Try to lock-shared a mutex that is not locked yet. This should succeed immediately.
+ {
+ std::shared_timed_mutex m;
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ bool succeeded = m.try_lock_shared_until(std::chrono::steady_clock::now() + std::chrono::milliseconds(1));
+ assert(succeeded);
+ m.unlock_shared();
+ }));
+ }
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Try to lock-shared an already-locked mutex for a long enough amount of time and succeed.
+ // This is technically flaky, but we use such long durations that it should pass even
+ // in slow or contended environments.
{
- countDown.store(threads);
+ std::chrono::milliseconds const wait_time(500);
+ std::chrono::milliseconds const tolerance = wait_time * 3;
+ std::atomic<int> ready(0);
+
+ std::shared_timed_mutex m;
m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < threads; ++i)
- v.push_back(support::make_test_thread(f1));
- while (countDown > 0)
- std::this_thread::yield();
- m.unlock();
- for (auto& t : v)
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ ++ready;
+ while (ready < 5)
+ /* spin until all threads are created */;
+
+ auto elapsed = measure([&] {
+ bool succeeded = m.try_lock_shared_until(std::chrono::steady_clock::now() + wait_time);
+ assert(succeeded);
+ m.unlock_shared();
+ });
+
+ // Ensure we didn't wait significantly longer than our timeout. This is technically
+ // flaky and non-conforming because an implementation is free to block for arbitrarily
+ // long, but any decent quality implementation should pass this test.
+ assert(elapsed - wait_time < tolerance);
+ }));
+ }
+
+ // Wait for all the threads to be ready to take the lock before we unlock it from here, otherwise
+ // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+ // There is still technically a race condition here.
+ while (ready < 5)
+ /* spin */;
+ std::this_thread::sleep_for(wait_time / 5);
+
+ m.unlock(); // this should allow the threads to lock-shared 'm'
+
+ for (auto& t : threads)
t.join();
}
+
+ // Try to lock-shared an already-locked mutex for a short amount of time and fail.
+ // Again, this is technically flaky but we use such long durations that it should work.
{
+ std::chrono::milliseconds const wait_time(10);
+ std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+ std::shared_timed_mutex m;
m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < threads; ++i)
- v.push_back(support::make_test_thread(f2));
- for (auto& t : v)
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ auto elapsed = measure([&] {
+ bool succeeded = m.try_lock_shared_until(std::chrono::steady_clock::now() + wait_time);
+ assert(!succeeded);
+ });
+
+ // Ensure we failed within some bounded time.
+ assert(elapsed - wait_time < tolerance);
+ }));
+ }
+
+ for (auto& t : threads)
t.join();
+
m.unlock();
}
diff --git a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp
index fb521ef..948364d 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.mutex.requirements/thread.sharedtimedmutex.requirements/thread.sharedtimedmutex.class/try_lock_until.pass.cpp
@@ -5,12 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
-
// <shared_mutex>
// class shared_timed_mutex;
@@ -19,69 +17,89 @@
// bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
#include <shared_mutex>
-#include <thread>
-#include <cstdlib>
+#include <atomic>
#include <cassert>
#include <chrono>
+#include <thread>
#include "make_test_thread.h"
-#include "test_macros.h"
-std::shared_timed_mutex m;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+}
-typedef std::chrono::steady_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+int main(int, char**) {
+ // Try to lock a mutex that is not locked yet. This should succeed immediately.
+ {
+ std::shared_timed_mutex m;
+ bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + std::chrono::milliseconds(1));
+ assert(succeeded);
+ m.unlock();
+ }
+ // Try to lock an already-locked mutex for a long enough amount of time and succeed.
+ // This is technically flaky, but we use such long durations that it should pass even
+ // in slow or contended environments.
+ {
+ std::chrono::milliseconds const wait_time(500);
+ std::chrono::milliseconds const tolerance = wait_time * 3;
+ std::atomic<bool> ready(false);
-ms WaitTime = ms(250);
+ std::shared_timed_mutex m;
+ m.lock();
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+ std::thread t = support::make_test_thread([&] {
+ auto elapsed = measure([&] {
+ ready = true;
+ bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time);
+ assert(succeeded);
+ m.unlock();
+ });
-void f1()
-{
- time_point t0 = Clock::now();
- assert(m.try_lock_until(Clock::now() + WaitTime + Tolerance) == true);
- time_point t1 = Clock::now();
- m.unlock();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
+ // Ensure we didn't wait significantly longer than our timeout. This is technically
+ // flaky and non-conforming because an implementation is free to block for arbitrarily
+ // long, but any decent quality implementation should pass this test.
+ assert(elapsed - wait_time < tolerance);
+ });
-void f2()
-{
- time_point t0 = Clock::now();
- assert(m.try_lock_until(Clock::now() + WaitTime) == false);
- time_point t1 = Clock::now();
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
+ // Wait for the thread to be ready to take the lock before we unlock it from here, otherwise
+ // there's a high chance that we're not testing the "locking an already locked" mutex use case.
+ // There is still technically a race condition here.
+ while (!ready)
+ /* spin */;
+ std::this_thread::sleep_for(wait_time / 5);
-int main(int, char**)
-{
- {
- m.lock();
- std::thread t = support::make_test_thread(f1);
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- t.join();
- }
- {
- m.lock();
- std::thread t = support::make_test_thread(f2);
- std::this_thread::sleep_for(WaitTime + Tolerance);
- m.unlock();
- t.join();
- }
+ m.unlock(); // this should allow the thread to lock 'm'
+ t.join();
+ }
+
+ // Try to lock an already-locked mutex for a short amount of time and fail.
+ // Again, this is technically flaky but we use such long durations that it should work.
+ {
+ std::chrono::milliseconds const wait_time(10);
+ std::chrono::milliseconds const tolerance(750); // in case the thread we spawned goes to sleep or something
+
+ std::shared_timed_mutex m;
+ m.lock();
+
+ std::thread t = support::make_test_thread([&] {
+ auto elapsed = measure([&] {
+ bool succeeded = m.try_lock_until(std::chrono::steady_clock::now() + wait_time);
+ assert(!succeeded);
+ });
+
+ // Ensure we failed within some bounded time.
+ assert(elapsed - wait_time < tolerance);
+ });
+
+ t.join();
+
+ m.unlock();
+ }
return 0;
}
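
Unlike the _for overloads, the _until overloads tested above take an absolute deadline, which the patch builds as steady_clock::now() + wait_time. A brief standalone sketch of the failing case under that convention (timings are illustrative, not from the patch):

#include <cassert>
#include <chrono>
#include <shared_mutex>
#include <thread>

int main() {
  std::shared_timed_mutex m;
  m.lock();

  std::thread t([&] {
    // An absolute deadline computed from steady_clock is immune to wall-clock adjustments.
    auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(10);
    assert(!m.try_lock_until(deadline)); // still held exclusively by main()
  });
  t.join();

  m.unlock();
  return 0;
}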
diff --git a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp
index 22eed73..5a4a0a94b 100644
--- a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp
index c01c785..b244a9d 100644
--- a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp
index dcc298c..b7c8d53 100644
--- a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// <semaphore>
diff --git a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
index 6f3ed5e..bf6b0f0 100644
--- a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// <semaphore>
diff --git a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp
index 3c4d179..d068872 100644
--- a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp
index 77f15ec..ad3c0fb 100644
--- a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp
index ec159da..fb6fff3 100644
--- a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp
+++ b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp
@@ -7,10 +7,7 @@
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++03, c++11
-
-// Until we drop support for the synchronization library in C++11/14/17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// XFAIL: availability-synchronization_library-missing
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp
index 3287b95..5618836 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h
index cc38a53..f55f0e2 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp
index 10b7317..8555ebd 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp
index 575e5dd..39a2575 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
index b478893..f0db4e1 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp
index 5a2f014..b507e77 100644
--- a/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp
+++ b/libcxx/test/std/thread/thread.threads/thread.thread.class/thread.thread.id/types.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
index 35f3af2..02dc215 100644
--- a/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.day.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
index c1d7c21..1a471ac 100644
--- a/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.duration.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
index f57841c..64fb1f1 100644
--- a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
index b4127af..50e3963 100644
--- a/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.hh_mm_ss.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp
index 019a1fc..db69e082 100644
--- a/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.local_info.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
index 45c3a12..f062e7a 100644
--- a/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.local_time.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month.pass.cpp
index 6e73638..09c0fad 100644
--- a/libcxx/test/std/time/time.syn/formatter.month.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp
index 746554d..63ead5c 100644
--- a/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month_day.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp
index f29ccea4..6f9bfd8 100644
--- a/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month_day_last.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp
index f2a73c5..3d3430f 100644
--- a/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.month_weekday.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp
index d579774..0e9018d 100644
--- a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
index 3a7d6f9..96e77da 100644
--- a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp
index 28b742a..4f232ee 100644
--- a/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.weekday.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp
index 03d3e77..3ebc267 100644
--- a/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.weekday_index.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp
index ea73382..c8f68aa9 100644
--- a/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.weekday_last.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp
index cf1b99a..e1a7d78 100644
--- a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp
index f80f474..96b654d 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp
index 1f2af1c..95e1795 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp
index db1cdfc..e31ed52 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_day_last.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp
index fcf0acc..38a9538 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_weekday.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp
index a9c2b34..3dfda72 100644
--- a/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.year_month_weekday_last.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp
index 1e366ac..900cf42 100644
--- a/libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.zoned_time.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/time/time.syn/formatter_tests.h b/libcxx/test/std/time/time.syn/formatter_tests.h
index 1b343b5..798b953 100644
--- a/libcxx/test/std/time/time.syn/formatter_tests.h
+++ b/libcxx/test/std/time/time.syn/formatter_tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp
index 545215a..09d2ade 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/base.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp
index 67a91b9..9c7ba5a 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/ctor.error.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp
index 4e1e2ea..01f6662 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/error.member.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp
index 092e115..92e2fef 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/void-specialization.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp b/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp
index bc5e356..3ea5d8b 100644
--- a/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.bad/what.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp
index 2d3b036..807a8af 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp
index 2f52913..2f6af70 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp
index 065827a..1fe7c41 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp
index 92a0c42..e7909d6 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp
index 9850197..4de9b98 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp
index 2f36349..cc365c9 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.intializer_list.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
index 7e37f8b..d901177 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp
index 16de28d..ac369a5 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp
index 0e30ea2..dea0e53 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.convert.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp
index ba98317..9e78596 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp
index dcd046b..9f9ff7e 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.default.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp
index 88ec419..31de6e4 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp
index a97086f..ed2afbb 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.inplace_init_list.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp
index cd89e24..b8ab466 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp
index 1cf3d9c..fa9f5a1 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp
index 27ce977..f71d17c 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp
index 4f5d3d1..6da98e1 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpect_init_list.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp
index bbfd304..9cc8d11 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp
index 800d47b..cb87db3 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp
index 7596d2e..a4250e6 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/dtor.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
index 29945d2..bc8b9de 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.T2.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
index 62f0d6f..9325c6c 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.other_expected.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
index 963f39b..a8c469d 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/equality/equality.unexpected.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp
index 72fb166..6cbadb8 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/arrow.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp
index 8daecf3..e4aab72 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/bool.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp
index 80bc243..628538e 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/deref.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp
index 491e5e6..3969466 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/error.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp
index 8e39986..6111d7a 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/has_value.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp
index 44d1b2f..7a2d3cb 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/value.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp
index fa05a13..82bcd7b 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/observers/value_or.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp
index 05b49e3..607113a 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/swap/free.swap.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp
index f19599d..d2eaf5c 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp
index 74f46f0..088ed21 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp
index 31573be..010f45f 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/assign/assign.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp
index eb23a2f..61b09d9 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctad.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp
index 9a6600232..57a26a6 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp
index 86e903a..bdeb74e 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.error.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp
index 5cad5c7..3e84244 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp
index ee9a228..b8927f5 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.inplace_init_list.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp
index 3028dfa..53d4c6b 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/ctor/ctor.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp
index 4d249df..3c29cf9 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/equality.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp
index 8cfe555..c6b9f3c 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp
index ffd8ed8..13a7229 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.const_ref_ref.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp
index 4f22eb4..33c85ea 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp
index 415435b..69b2525 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/observer/error.ref_ref.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp
index ac0fb82..711a779 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.free.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp
index d77870e..4f36514 100644
--- a/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.unexpected/swap/swap.member.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp
index a51916f..6ed1a98 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp
index 60ae034..d96a70c 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp
index 699597d..409352e 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp
index 641eb492..31cecc1 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp
index a01895a..5df73cf 100644
--- a/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/assign/emplace.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp
index 05f556e..cca3ec5 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp
index a48888b..cb2d46f 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.convert.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp
index 7c04a5f..430f0ca 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp
index e05cf027..d6a71fa 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.default.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp
index 848e61d..c00d0df5 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.inplace.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp
index bfb5028..1a79a4fd 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp
index 85bc98d..4e7657c 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp
index 4128668..7ece9db 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpect_init_list.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp
index ba738a3..be8dcce 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.copy.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp
index 33a5e72..fea7825 100644
--- a/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/ctor/ctor.unexpected.move.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp
index aa8f225..798149f 100644
--- a/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/dtor.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
index eb05a84..8b24875 100644
--- a/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/equality/equality.other_expected.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
index ce95d16..4500971 100644
--- a/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/equality/equality.unexpected.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp
index 42e9f22..b4e6b74 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/bool.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp
index 536f898..b08ffce 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/deref.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp
index f7be760..b9d6667 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/error.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp
index fe92bb4..549d2f08 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/has_value.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
index a24d67c..2b109f9 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp
index 5fe3db5..b76a561 100644
--- a/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/swap/free.swap.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp
index 25601af..368a85b 100644
--- a/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp
index 62fd0f2..de551be 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp
index 2d5ee83..ff51506 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_format_args.sh.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp
index 73c4395..2aa5807 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg.store/make_wformat_args.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp
index 0f61c4b..e92b559 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/ctor.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp
index 407ab9a..472d5ed 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/operator_bool.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
index 284b03c..829b741 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
index 4c60cb0..874d609 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
index 6a3896c..e3e3e9a 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
index 5ab8c64..d99675a7 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp
index b87b5c7..e1c3493 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/ctad.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp
index bb542a8..4686ed8 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/ctor.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
index 8043948..c7dd82d 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp
index ba5ef4e..d44877e 100644
--- a/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.arguments/format.args/types.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
index 94a1ea5..52cfa2c 100644
--- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp
index a20972c..bddc36d 100644
--- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.float.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp
index c81074e..824c5d6 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/advance_to.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp
index 824813d..673d5dd 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/arg.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp
index 83ece7d..a32e75a 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/ctor.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp
index 14bdc14..197f9bd 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/locale.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp
index 3d3a5d1..b887183 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.context/format.context/out.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.locking/enable_nonlocking_formatter_optimization.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.locking/enable_nonlocking_formatter_optimization.compile.pass.cpp
new file mode 100644
index 0000000..934de7c
--- /dev/null
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.locking/enable_nonlocking_formatter_optimization.compile.pass.cpp
@@ -0,0 +1,236 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <format>
+
+// template<class T>
+// constexpr bool enable_nonlocking_formatter_optimization = false;
+
+// Remarks: Pursuant to [namespace.std], users may specialize
+// enable_nonlocking_formatter_optimization for cv-unqualified program-defined
+// types. Such specializations shall be usable in constant expressions
+// ([expr.const]) and have type const bool.
+
+// [format.formatter.spec]
+// In addition, for each type T for which a formatter specialization is provided
+// above, each of the headers provides the following specialization:
+//
+// template<>
+// inline constexpr bool enable_nonlocking_formatter_optimization<T> = true;
+
+#include <array>
+#include <bitset>
+#include <bitset>
+#include <chrono>
+#include <complex>
+#include <concepts>
+#include <deque>
+#include <filesystem>
+#include <format>
+#include <forward_list>
+#include <list>
+#include <map>
+#include <memory>
+#include <optional>
+#include <queue>
+#include <set>
+#include <span>
+#include <stack>
+#include <system_error>
+#include <tuple>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <valarray>
+#include <variant>
+#include <vector>
+
+#include "test_macros.h"
+#include "min_allocator.h"
+
+#ifndef TEST_HAS_NO_LOCALIZATION
+# include <regex>
+#endif
+#ifndef TEST_HAS_NO_THREADS
+# include <thread>
+#endif
+
+// Tests for P0645 Text Formatting
+template <class CharT>
+void test_P0645() {
+ static_assert(std::enable_nonlocking_formatter_optimization<CharT>);
+
+ static_assert(std::enable_nonlocking_formatter_optimization<CharT*>);
+ static_assert(std::enable_nonlocking_formatter_optimization<const CharT*>);
+ static_assert(std::enable_nonlocking_formatter_optimization<CharT[42]>);
+
+ static_assert(std::enable_nonlocking_formatter_optimization<std::basic_string<CharT>>);
+ static_assert(std::enable_nonlocking_formatter_optimization<std::basic_string_view<CharT>>);
+
+ static_assert(std::enable_nonlocking_formatter_optimization<bool>);
+
+ static_assert(std::enable_nonlocking_formatter_optimization<signed char>);
+ static_assert(std::enable_nonlocking_formatter_optimization<signed short>);
+ static_assert(std::enable_nonlocking_formatter_optimization<signed int>);
+ static_assert(std::enable_nonlocking_formatter_optimization<signed long>);
+ static_assert(std::enable_nonlocking_formatter_optimization<signed long long>);
+#ifndef TEST_HAS_NO_INT128
+ static_assert(std::enable_nonlocking_formatter_optimization<__int128_t>);
+#endif
+
+ static_assert(std::enable_nonlocking_formatter_optimization<unsigned char>);
+ static_assert(std::enable_nonlocking_formatter_optimization<unsigned short>);
+ static_assert(std::enable_nonlocking_formatter_optimization<unsigned int>);
+ static_assert(std::enable_nonlocking_formatter_optimization<unsigned long>);
+ static_assert(std::enable_nonlocking_formatter_optimization<unsigned long long>);
+#ifndef TEST_HAS_NO_INT128
+ static_assert(std::enable_nonlocking_formatter_optimization<__uint128_t>);
+#endif
+
+ static_assert(std::enable_nonlocking_formatter_optimization<float>);
+ static_assert(std::enable_nonlocking_formatter_optimization<double>);
+ static_assert(std::enable_nonlocking_formatter_optimization<long double>);
+
+ static_assert(std::enable_nonlocking_formatter_optimization<std::nullptr_t>);
+ static_assert(std::enable_nonlocking_formatter_optimization<void*>);
+ static_assert(std::enable_nonlocking_formatter_optimization<const void*>);
+}
+
+// Tests for P1361 Integration of chrono with text formatting
+//
+// Some tests are commented out since these types haven't been implemented in
+// chrono yet. Once P1361 has been fully implemented, these formatters should
+// all be enabled.
+void test_P1361() {
+// The chrono formatters require localization support.
+// [time.format]/7
+// If the chrono-specs is omitted, the chrono object is formatted as if by
+// streaming it to std::ostringstream os with the formatting
+// locale imbued and copying os.str() through the output iterator of the
+// context with additional padding and adjustments as specified by the format
+// specifiers.
+// In libc++ std::ostringstream requires localization support.
+#ifndef TEST_HAS_NO_LOCALIZATION
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::microseconds>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::sys_time<std::chrono::microseconds>>);
+ //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::utc_time<std::chrono::microseconds>>);
+ //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::tai_time<std::chrono::microseconds>>);
+ //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::gps_time<std::chrono::microseconds>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::file_time<std::chrono::microseconds>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::local_time<std::chrono::microseconds>>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::day>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::weekday>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::weekday_indexed>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::weekday_last>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_day>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_day_last>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_weekday>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::month_weekday_last>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_day>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_day_last>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_weekday>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::year_month_weekday_last>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::hh_mm_ss<std::chrono::microseconds>>);
+
+ //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::sys_info>);
+ //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::local_info>);
+
+ //static_assert(!std::enable_nonlocking_formatter_optimization<std::chrono::zoned_time>);
+
+#endif // TEST_HAS_NO_LOCALIZATION
+}
+
+// Tests for P1636 Formatters for library types
+//
+// The paper has not been voted in, so currently all these formatters are disabled.
+// Note the paper has been abandoned; the types are kept since other papers may
+// introduce formatters for them.
+void test_P1636() {
+#ifndef TEST_HAS_NO_THREADS
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::thread::id>);
+#endif
+}
+
+template <class Vector>
+void test_P2286_vector_bool() {
+ static_assert(!std::enable_nonlocking_formatter_optimization<Vector>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<typename Vector::reference>);
+
+ // The const_reference shall be a bool.
+ // However libc++ uses a __bit_const_reference<vector> when
+// _LIBCPP_ABI_BITSET_VECTOR_BOOL_CONST_SUBSCRIPT_RETURN_BOOL is not defined.
+ static_assert(!std::enable_nonlocking_formatter_optimization<const Vector&>);
+ static_assert(std::enable_nonlocking_formatter_optimization<typename Vector::const_reference> ==
+ std::same_as<typename Vector::const_reference, bool>);
+}
+
+// Tests for P2286 Formatting ranges
+void test_P2286() {
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::array<int, 42>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::vector<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::deque<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::forward_list<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::list<int>>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::set<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::map<int, int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::multiset<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::multimap<int, int>>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_set<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_map<int, int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_multiset<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::unordered_multimap<int, int>>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::stack<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::queue<int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::priority_queue<int>>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::span<int>>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::valarray<int>>);
+
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::pair<int, int>>);
+ static_assert(!std::enable_nonlocking_formatter_optimization<std::tuple<int>>);
+
+ test_P2286_vector_bool<std::vector<bool>>();
+ test_P2286_vector_bool<std::vector<bool, std::allocator<bool>>>();
+ test_P2286_vector_bool<std::vector<bool, min_allocator<bool>>>();
+}
+
+// The trait does not care whether the type is formattable; obviously the
+// trait for non-formattable types is not used.
+struct not_formattable_nonlocking_disabled {};
+static_assert(!std::enable_nonlocking_formatter_optimization<not_formattable_nonlocking_disabled>);
+
+struct not_formattable_nonlocking_enabled {};
+template <>
+inline constexpr bool std::enable_nonlocking_formatter_optimization<not_formattable_nonlocking_enabled> = true;
+static_assert(std::enable_nonlocking_formatter_optimization<not_formattable_nonlocking_enabled>);
+
+void test() {
+ test_P0645<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+ test_P0645<wchar_t>();
+#endif
+ test_P1361();
+ test_P1636();
+ test_P2286();
+}
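The compile-only test above exercises std::enable_nonlocking_formatter_optimization, the trait from P3107 intended to let std::print write directly to the stream instead of first formatting into a temporary string, provided every argument's formatter opts in. As a rough, self-contained sketch of what that opt-in looks like for a program-defined type (not part of this patch; the type Point and its formatter are invented purely for illustration, and a standard library implementing P3107 is assumed):

// Illustrative sketch only, not part of the patch. Assumes a standard library
// that provides std::enable_nonlocking_formatter_optimization (P3107); the
// type Point and its formatter are hypothetical.
#include <format>
#include <string>
#include <utility>

struct Point {
  int x;
  int y;
};

// User-provided formatter. It only writes through the context's output
// iterator, so marking it non-locking is safe.
template <>
struct std::formatter<Point, char> : std::formatter<std::string, char> {
  template <class FormatContext>
  auto format(const Point& p, FormatContext& ctx) const {
    return std::formatter<std::string, char>::format(std::format("({}, {})", p.x, p.y), ctx);
  }
};

// Opt in, as permitted by [format.formatter.spec]: the specialization must be
// usable in constant expressions and have type const bool.
template <>
inline constexpr bool std::enable_nonlocking_formatter_optimization<Point> = true;

static_assert(std::enable_nonlocking_formatter_optimization<Point>);
// Ranges and tuples stay locking by default, matching test_P2286 above.
static_assert(!std::enable_nonlocking_formatter_optimization<std::pair<int, int>>);

A formatter that streams through std::ostringstream, touches the locale, or has other observable side effects should leave the trait at its default of false; that is why the chrono formatters above are all expected to report false.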
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
index 116f78e..80b6946 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.bool.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
index 3125dd8..4f9efca 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.c_string.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp
index f3d587e1..1043676 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.fsigned-char.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp
index a7577c1..1fde08a 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.funsigned-char.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
index 0723547..dd240ee 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
index 23dcc0b..1b3ff52 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
index 263dc1d..3ad84577 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.floating_point.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
index 5921cc6..5c85580 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.handle.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
index 408168e..19bb8bf 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.pointer.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
index cdd56d1..d589a31 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.signed_integral.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
index 49f54dae..9728a2f 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.string.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
index a953746..7446ab2 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.unsigned_integral.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp
index f2fab80..40a40a7 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/advance_to.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp
index db301d3..113c7a4 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/begin.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp
index 96a6b26..3760b1a 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp
index ad36b66..db6683e 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/check_arg_id.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp
index ae50394..c7dfb601 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/ctor.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp
index 3cd95b6..9fb4fbb 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/end.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp
index 5049c29..5a82bcd 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp
index 5f43b4e..a1404e0 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/next_arg_id.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp
index 079164b..0962ff5 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.parse.ctx/types.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp b/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp
index bfd8041..0c2480f 100644
--- a/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/P2418.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
index 5f248f8..c2508fd 100644
--- a/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
index 96c1e26..fa30762 100644
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
index d0a687b..cd555e1 100644
--- a/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/fill.unicode.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp
index a0fe98e..1078c31 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.locale.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp
index 0ddb597..a262d64 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.locale.runtime_format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp
index 8b06141..63e6f8f 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.locale.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp
index 5fe210e..5ec244c 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp
index 089dd61..4bec909 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.runtime_format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format.verify.cpp
index 53b9b29..9e1f86b 100644
--- a/libcxx/test/std/utilities/format/format.functions/format.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h
index aa33c20..b2ed677 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_tests.h
+++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp
index 5de7c7b..ec21133 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.locale.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp
index f3eb3f1..488110a 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.locale.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp
index 5c07409..31509e3 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp
index 8441e9f..671359a 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp
index 948f2b3..3c266ad 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp
index dd25616..ebe6ff0 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.locale.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp
index 12ec459..149e3413 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp b/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp
index ef90fe3..ccea8d4 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/format_to_n.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp
index 16f0524f..e5a8582 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp
index 8ed848e..d837443 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.locale.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp
index ac59af6..472ac44 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp b/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp
index 91b786e..8420a03 100644
--- a/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/formatted_size.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp
index b25fd25..51be78e 100644
--- a/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/locale-specific_form.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
index b5b0442..11989f1 100644
--- a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp
index 228ccba..2cde40d 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat.locale.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
index e16d50f..99a1bd1 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp
index bfbe9d0..34d7c84f 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.locale.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp
index c247a9b..0047db7 100644
--- a/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/vformat_to.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp
index 75d227b..81fdab47 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
index 0eb984c..156f0ee 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp
index 6f361de..97132ab 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_brackets.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp
index 87cf275..e767339 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtdef/set_separator.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp
index 8756cb9..d3c89e8 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp
index 397b1f5..05de4f9 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/format_kind.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp
index 869984f..a4b2854 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtkind/range_format.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp
index 5ac84b4..d64e9a6 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h
index 3ebaa05..1108879 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp
index 567ebc1..93745cf 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp
index a7dc5e9..c0e1d84 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
index 99d6aa7..3fb0a95 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtmap/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp
index 4b971cb..98097cf 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h
index 5a0c89e..258a6a1 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp
index 60dde24..6afae31 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp
index 5b94aa0..2a32e4f 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
index 182beff..ecba174 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtset/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp
index dd2815f..5fc9f94 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h
index 261431c..632f664 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp
index b836f2e..2328a72 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp
index 675a5e8..7d6502e 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
index 3354de3..2317507 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp
index a534810..a0b8aed 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h
index 78b067e..06bac9b 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp
index d46499b..9b43a22 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp
index 8589522..7b19ed0 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
index 2d0cef1..f095d4d 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp
index a26a10c..ae7eb89 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_brackets.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp
index 9d402db..fd80c3f 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/set_separator.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp
index 2770f4c..45511ae 100644
--- a/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.range/format.range.formatter/underlying.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp
index 5f1757a..bbeb0df 100644
--- a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp
index 9b13e25..b5a893f 100644
--- a/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.string/format.string.std/lwg3720_arg_id_width_precision_allowed_types.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp
index 8cdcc9d..ba4002c 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp
index a353339..3d7cdaa 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.format.verify.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h b/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h
index e26d2df..d8ed015 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.tests.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp
index e3f20e4..f44e81b 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.functions.vformat.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp
index d83139b..c4fd45f 100644
--- a/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/format.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
index 8653c282..0c5a11b 100644
--- a/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/parse.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp
index b048008..7aeaaf6 100644
--- a/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/set_brackets.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp b/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp
index af285e63..246de50 100644
--- a/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.tuple/set_separator.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/format/types.compile.pass.cpp b/libcxx/test/std/utilities/format/types.compile.pass.cpp
index 93723dd..62ed37d 100644
--- a/libcxx/test/std/utilities/format/types.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/types.compile.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.class.general/equality.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.class.general/equality.pass.cpp
new file mode 100644
index 0000000..792f05d2
--- /dev/null
+++ b/libcxx/test/std/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.class.general/equality.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: availability-pmr-missing
+
+// <memory_resource>
+
+// template <class T> class polymorphic_allocator
+
+// friend bool operator==(const polymorphic_allocator& a,
+// const polymorphic_allocator& b) noexcept
+
+#include <memory_resource>
+#include <cassert>
+#include <vector>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+ std::pmr::unsynchronized_pool_resource a;
+ std::pmr::vector<int> vec(&a);
+
+ assert(vec.get_allocator() == &a);
+ static_assert(noexcept(vec.get_allocator() == &a));
+
+ // LWG3683 added operator== after C++20. In C++20 operator!= is generated by
+ // the compiler. Libc++ adds operator!= in C++17 as an extension. MSVC STL
+ // and libstdc++ have done the same so test this extension unconditionally.
+ std::pmr::unsynchronized_pool_resource b;
+
+ assert(vec.get_allocator() != &b);
+ static_assert(noexcept(vec.get_allocator() != &b));
+
+ return 0;
+}
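
The new test relies on std::pmr::polymorphic_allocator comparing equal exactly when the wrapped memory_resource objects compare equal (the test's `vec.get_allocator() == &a` works because a memory_resource* converts implicitly to an allocator). As a minimal standalone sketch of that semantic, not taken from the patch and with the resource choice purely illustrative:

#include <cassert>
#include <memory_resource>

int main() {
  std::pmr::monotonic_buffer_resource r1;
  std::pmr::monotonic_buffer_resource r2;

  std::pmr::polymorphic_allocator<int> a1(&r1);
  std::pmr::polymorphic_allocator<int> a2(&r1); // shares r1 with a1
  std::pmr::polymorphic_allocator<int> a3(&r2); // distinct resource

  assert(a1 == a2); // equal: *a1.resource() == *a2.resource()
  assert(a1 != a3); // unequal: two different monotonic_buffer_resource objects
  return 0;
}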
diff --git a/libcxx/test/support/atomic_helpers.h b/libcxx/test/support/atomic_helpers.h
index 0266a09..d2f2b75 100644
--- a/libcxx/test/support/atomic_helpers.h
+++ b/libcxx/test/support/atomic_helpers.h
@@ -11,9 +11,112 @@
#include <cassert>
#include <cstdint>
+#include <cstddef>
+#include <type_traits>
#include "test_macros.h"
+#if defined(TEST_COMPILER_CLANG)
+# define TEST_ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE
+# define TEST_ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE
+# define TEST_ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE
+# define TEST_ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE
+# define TEST_ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE
+# define TEST_ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE
+#elif defined(TEST_COMPILER_GCC)
+# define TEST_ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE
+# define TEST_ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
+# define TEST_ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
+# define TEST_ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
+# define TEST_ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
+# define TEST_ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
+#elif TEST_COMPILER_MSVC
+// This is lifted from STL/stl/inc/atomic on github for the purposes of
+// keeping the tests compiling for MSVC's STL. It's not a perfect solution
+// but at least the tests will keep running.
+//
+// Note MSVC's STL never produces a type that is sometimes lock free, but not always lock free.
+template <class T, size_t Size = sizeof(T)>
+constexpr int msvc_is_lock_free_macro_value() {
+  return (Size <= 8 && (Size & (Size - 1)) == 0) ? 2 : 0;
+}
+# define TEST_ATOMIC_CHAR_LOCK_FREE ::msvc_is_lock_free_macro_value<char>()
+# define TEST_ATOMIC_SHORT_LOCK_FREE ::msvc_is_lock_free_macro_value<short>()
+# define TEST_ATOMIC_INT_LOCK_FREE ::msvc_is_lock_free_macro_value<int>()
+# define TEST_ATOMIC_LONG_LOCK_FREE ::msvc_is_lock_free_macro_value<long>()
+# define TEST_ATOMIC_LLONG_LOCK_FREE ::msvc_is_lock_free_macro_value<long long>()
+# define TEST_ATOMIC_POINTER_LOCK_FREE ::msvc_is_lock_free_macro_value<void*>()
+#else
+# error "Unknown compiler"
+#endif
+
+#ifdef TEST_COMPILER_CLANG
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wc++11-extensions"
+#endif
+
+enum class LockFreeStatus : int { unknown = -1, never = 0, sometimes = 1, always = 2 };
+
+// We should really be checking whether the alignment of T is greater-than-or-equal-to the alignment required
+// for T to be atomic, but this is basically impossible to implement portably. Instead, we assume that any type
+// aligned to at least its size is going to be atomic if there exists atomic operations for that size at all,
+// which is true on most platforms. This technically reduces our test coverage in the sense that if a type has
+// an alignment requirement less than its size but could still be made lockfree, LockFreeStatusInfo will report
+// that we don't know whether it is lockfree or not.
+#define COMPARE_TYPES(T, FundamentalT) (sizeof(T) == sizeof(FundamentalT) && TEST_ALIGNOF(T) >= sizeof(T))
+
+template <class T>
+struct LockFreeStatusInfo {
+ static const LockFreeStatus value = LockFreeStatus(
+ COMPARE_TYPES(T, char)
+ ? TEST_ATOMIC_CHAR_LOCK_FREE
+ : (COMPARE_TYPES(T, short)
+ ? TEST_ATOMIC_SHORT_LOCK_FREE
+ : (COMPARE_TYPES(T, int)
+ ? TEST_ATOMIC_INT_LOCK_FREE
+ : (COMPARE_TYPES(T, long)
+ ? TEST_ATOMIC_LONG_LOCK_FREE
+ : (COMPARE_TYPES(T, long long)
+ ? TEST_ATOMIC_LLONG_LOCK_FREE
+ : (COMPARE_TYPES(T, void*) ? TEST_ATOMIC_POINTER_LOCK_FREE : -1))))));
+
+ static const bool status_known = LockFreeStatusInfo::value != LockFreeStatus::unknown;
+};
+
+#undef COMPARE_TYPES
+
+// This doesn't work in C++03 due to issues with scoped enumerations. Just disable the test.
+#if TEST_STD_VER >= 11
+static_assert(LockFreeStatusInfo<char>::status_known, "");
+static_assert(LockFreeStatusInfo<short>::status_known, "");
+static_assert(LockFreeStatusInfo<int>::status_known, "");
+static_assert(LockFreeStatusInfo<long>::status_known, "");
+static_assert(LockFreeStatusInfo<void*>::status_known, "");
+
+// long long is a bit funky: on some platforms, its alignment is 4 bytes but its size is
+// 8 bytes. In that case, atomics may or may not be lockfree based on their address.
+static_assert(alignof(long long) == sizeof(long long) ? LockFreeStatusInfo<long long>::status_known : true, "");
+
+// Those should always be lock free: hardcode some expected values to make sure our tests are actually
+// testing something meaningful.
+static_assert(LockFreeStatusInfo<char>::value == LockFreeStatus::always, "");
+static_assert(LockFreeStatusInfo<short>::value == LockFreeStatus::always, "");
+static_assert(LockFreeStatusInfo<int>::value == LockFreeStatus::always, "");
+#endif
+
+// These macros are somewhat surprising to use, since they take the values 0, 1, or 2.
+// To make the tests clearer, get rid of them in favor of LockFreeStatusInfo.
+#undef TEST_ATOMIC_CHAR_LOCK_FREE
+#undef TEST_ATOMIC_SHORT_LOCK_FREE
+#undef TEST_ATOMIC_INT_LOCK_FREE
+#undef TEST_ATOMIC_LONG_LOCK_FREE
+#undef TEST_ATOMIC_LLONG_LOCK_FREE
+#undef TEST_ATOMIC_POINTER_LOCK_FREE
+
+#ifdef TEST_COMPILER_CLANG
+# pragma clang diagnostic pop
+#endif
+
struct UserAtomicType {
int i;
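
LockFreeStatusInfo maps a type onto the compiler's *_LOCK_FREE value for the fundamental type of the same size, provided the alignment assumption spelled out above holds. A small sketch of how a test could consume it, assuming the usual support-header include path; the usage shown is illustrative rather than lifted from the patch:

#include <atomic>

#include "atomic_helpers.h"

template <class T>
void check_lock_free_expectation() {
  // Only assert anything when the helper could classify T at all.
  if constexpr (LockFreeStatusInfo<T>::status_known) {
    if constexpr (LockFreeStatusInfo<T>::value == LockFreeStatus::always)
      static_assert(std::atomic<T>::is_always_lock_free, "");
    else if constexpr (LockFreeStatusInfo<T>::value == LockFreeStatus::never)
      static_assert(!std::atomic<T>::is_always_lock_free, "");
    // LockFreeStatus::sometimes: lock-freedom depends on the object's
    // address, so nothing can be asserted at compile time.
  }
}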
diff --git a/libcxx/test/support/format.functions.common.h b/libcxx/test/support/format.functions.common.h
index 976df5b..473b4ef 100644
--- a/libcxx/test/support/format.functions.common.h
+++ b/libcxx/test/support/format.functions.common.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/support/test.support/make_string_header.pass.cpp b/libcxx/test/support/test.support/make_string_header.pass.cpp
index 4cee361..8fcc05e 100644
--- a/libcxx/test/support/test.support/make_string_header.pass.cpp
+++ b/libcxx/test/support/test.support/make_string_header.pass.cpp
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/support/test_basic_format_arg.h b/libcxx/test/support/test_basic_format_arg.h
index 1ec719a..f51f6e9 100644
--- a/libcxx/test/support/test_basic_format_arg.h
+++ b/libcxx/test/support/test_basic_format_arg.h
@@ -1,4 +1,5 @@
//===----------------------------------------------------------------------===//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 95d1b7d..44bd4a5 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -339,62 +339,92 @@ cpp20_random_access_iterator(It) -> cpp20_random_access_iterator<It>;
static_assert(std::random_access_iterator<cpp20_random_access_iterator<int*>>);
-template <class It>
-class contiguous_iterator
-{
- static_assert(std::is_pointer_v<It>, "Things probably break in this case");
+template <std::contiguous_iterator It>
+class contiguous_iterator {
+ It it_;
- It it_;
+ template <std::contiguous_iterator U>
+ friend class contiguous_iterator;
- template <class U> friend class contiguous_iterator;
public:
- typedef std::contiguous_iterator_tag iterator_category;
- typedef typename std::iterator_traits<It>::value_type value_type;
- typedef typename std::iterator_traits<It>::difference_type difference_type;
- typedef It pointer;
- typedef typename std::iterator_traits<It>::reference reference;
- typedef typename std::remove_pointer<It>::type element_type;
+ using iterator_category = std::contiguous_iterator_tag;
+ using value_type = typename std::iterator_traits<It>::value_type;
+ using difference_type = typename std::iterator_traits<It>::difference_type;
+ using pointer = typename std::iterator_traits<It>::pointer;
+ using reference = typename std::iterator_traits<It>::reference;
+ using element_type = value_type;
- TEST_CONSTEXPR_CXX14 It base() const {return it_;}
+ constexpr It base() const { return it_; }
- TEST_CONSTEXPR_CXX14 contiguous_iterator() : it_() {}
- TEST_CONSTEXPR_CXX14 explicit contiguous_iterator(It it) : it_(it) {}
+ constexpr contiguous_iterator() : it_() {}
+ constexpr explicit contiguous_iterator(It it) : it_(it) {}
- template <class U>
- TEST_CONSTEXPR_CXX14 contiguous_iterator(const contiguous_iterator<U>& u) : it_(u.it_) {}
+ template <class U>
+ constexpr contiguous_iterator(const contiguous_iterator<U>& u) : it_(u.it_) {}
- template <class U, class = typename std::enable_if<std::is_default_constructible<U>::value>::type>
- constexpr contiguous_iterator(contiguous_iterator<U>&& u) : it_(u.it_) { u.it_ = U(); }
+ template <class U, class = typename std::enable_if<std::is_default_constructible<U>::value>::type>
+ constexpr contiguous_iterator(contiguous_iterator<U>&& u) : it_(u.it_) {
+ u.it_ = U();
+ }
- TEST_CONSTEXPR reference operator*() const {return *it_;}
- TEST_CONSTEXPR pointer operator->() const {return it_;}
- TEST_CONSTEXPR reference operator[](difference_type n) const {return it_[n];}
-
- TEST_CONSTEXPR_CXX14 contiguous_iterator& operator++() {++it_; return *this;}
- TEST_CONSTEXPR_CXX14 contiguous_iterator& operator--() {--it_; return *this;}
- TEST_CONSTEXPR_CXX14 contiguous_iterator operator++(int) {return contiguous_iterator(it_++);}
- TEST_CONSTEXPR_CXX14 contiguous_iterator operator--(int) {return contiguous_iterator(it_--);}
-
- TEST_CONSTEXPR_CXX14 contiguous_iterator& operator+=(difference_type n) {it_ += n; return *this;}
- TEST_CONSTEXPR_CXX14 contiguous_iterator& operator-=(difference_type n) {it_ -= n; return *this;}
- friend TEST_CONSTEXPR_CXX14 contiguous_iterator operator+(contiguous_iterator x, difference_type n) {x += n; return x;}
- friend TEST_CONSTEXPR_CXX14 contiguous_iterator operator+(difference_type n, contiguous_iterator x) {x += n; return x;}
- friend TEST_CONSTEXPR_CXX14 contiguous_iterator operator-(contiguous_iterator x, difference_type n) {x -= n; return x;}
- friend TEST_CONSTEXPR difference_type operator-(contiguous_iterator x, contiguous_iterator y) {return x.it_ - y.it_;}
-
- friend TEST_CONSTEXPR bool operator==(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ == y.it_;}
- friend TEST_CONSTEXPR bool operator!=(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ != y.it_;}
- friend TEST_CONSTEXPR bool operator< (const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ < y.it_;}
- friend TEST_CONSTEXPR bool operator<=(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ <= y.it_;}
- friend TEST_CONSTEXPR bool operator> (const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ > y.it_;}
- friend TEST_CONSTEXPR bool operator>=(const contiguous_iterator& x, const contiguous_iterator& y) {return x.it_ >= y.it_;}
+ constexpr reference operator*() const { return *it_; }
+ constexpr pointer operator->() const { return it_; }
+ constexpr reference operator[](difference_type n) const { return it_[n]; }
+
+ constexpr contiguous_iterator& operator++() {
+ ++it_;
+ return *this;
+ }
+ constexpr contiguous_iterator& operator--() {
+ --it_;
+ return *this;
+ }
+ constexpr contiguous_iterator operator++(int) { return contiguous_iterator(it_++); }
+ constexpr contiguous_iterator operator--(int) { return contiguous_iterator(it_--); }
+
+ constexpr contiguous_iterator& operator+=(difference_type n) {
+ it_ += n;
+ return *this;
+ }
+ constexpr contiguous_iterator& operator-=(difference_type n) {
+ it_ -= n;
+ return *this;
+ }
+ friend constexpr contiguous_iterator operator+(contiguous_iterator x, difference_type n) {
+ x += n;
+ return x;
+ }
+ friend constexpr contiguous_iterator operator+(difference_type n, contiguous_iterator x) {
+ x += n;
+ return x;
+ }
+ friend constexpr contiguous_iterator operator-(contiguous_iterator x, difference_type n) {
+ x -= n;
+ return x;
+ }
+ friend constexpr difference_type operator-(contiguous_iterator x, contiguous_iterator y) { return x.it_ - y.it_; }
+
+ friend constexpr bool operator==(const contiguous_iterator& x, const contiguous_iterator& y) {
+ return x.it_ == y.it_;
+ }
+ friend constexpr bool operator!=(const contiguous_iterator& x, const contiguous_iterator& y) {
+ return x.it_ != y.it_;
+ }
+ friend constexpr bool operator<(const contiguous_iterator& x, const contiguous_iterator& y) { return x.it_ < y.it_; }
+ friend constexpr bool operator<=(const contiguous_iterator& x, const contiguous_iterator& y) {
+ return x.it_ <= y.it_;
+ }
+ friend constexpr bool operator>(const contiguous_iterator& x, const contiguous_iterator& y) { return x.it_ > y.it_; }
+ friend constexpr bool operator>=(const contiguous_iterator& x, const contiguous_iterator& y) {
+ return x.it_ >= y.it_;
+ }
// Note no operator<=>, use three_way_contiguous_iterator for testing operator<=>
- friend TEST_CONSTEXPR It base(const contiguous_iterator& i) { return i.it_; }
+ friend constexpr It base(const contiguous_iterator& i) { return i.it_; }
- template <class T>
- void operator,(T const &) = delete;
+ template <class T>
+ void operator,(T const&) = delete;
};
template <class It>
contiguous_iterator(It) -> contiguous_iterator<It>;
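
With this rewrite the test wrapper is constrained on the std::contiguous_iterator concept instead of insisting on a raw pointer, and it models the concept itself. A short sketch of what that buys in a test, assuming the usual test_iterators.h include path; the walk() helper is illustrative, not part of the patch:

#include <iterator>

#include "test_iterators.h"

static_assert(std::contiguous_iterator<contiguous_iterator<int*>>);

constexpr bool walk() {
  int a[3] = {1, 2, 3};
  contiguous_iterator<int*> first(a);
  contiguous_iterator<int*> last(a + 3);
  int sum = 0;
  for (; first != last; ++first)
    sum += *first; // dereferences straight through to the underlying array
  return sum == 6;
}
static_assert(walk(), "");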
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index e978875..97cdb03 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -263,6 +263,26 @@ DEFAULT_FEATURES = [
""",
),
),
+ # Check for a Windows UCRT bug (not fixed upstream yet).
+ # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0",
+ # while other C runtimes produce just "0x0p+0".
+ # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844
+ Feature(
+ name="win32-broken-printf-a-precision",
+ when=lambda cfg: "_WIN32" in compilerMacros(cfg)
+ and not programSucceeds(
+ cfg,
+ """
+ #include <stdio.h>
+ #include <string.h>
+ int main(int, char**) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%a", 0.0);
+ return strcmp(buf, "0x0p+0");
+ }
+ """,
+ ),
+ ),
# Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had
# mon_decimal_point == ".", which our tests don't handle.
Feature(
diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h
index 36bf454..2637d2d 100644
--- a/libcxxabi/src/demangle/ItaniumDemangle.h
+++ b/libcxxabi/src/demangle/ItaniumDemangle.h
@@ -200,8 +200,7 @@ private:
Prec Precedence : 6;
- // FIXME: Make these protected.
-public:
+protected:
/// Tracks if this node has a component on its right side, in which case we
/// need to call printRight.
Cache RHSComponentCache : 2;
@@ -255,6 +254,9 @@ public:
Kind getKind() const { return K; }
Prec getPrecedence() const { return Precedence; }
+ Cache getRHSComponentCache() const { return RHSComponentCache; }
+ Cache getArrayCache() const { return ArrayCache; }
+ Cache getFunctionCache() const { return FunctionCache; }
virtual bool hasRHSComponentSlow(OutputBuffer &) const { return false; }
virtual bool hasArraySlow(OutputBuffer &) const { return false; }
@@ -424,8 +426,8 @@ protected:
public:
QualType(const Node *Child_, Qualifiers Quals_)
- : Node(KQualType, Child_->RHSComponentCache,
- Child_->ArrayCache, Child_->FunctionCache),
+ : Node(KQualType, Child_->getRHSComponentCache(), Child_->getArrayCache(),
+ Child_->getFunctionCache()),
Quals(Quals_), Child(Child_) {}
Qualifiers getQuals() const { return Quals; }
@@ -554,8 +556,8 @@ struct AbiTagAttr : Node {
std::string_view Tag;
AbiTagAttr(Node *Base_, std::string_view Tag_)
- : Node(KAbiTagAttr, Base_->RHSComponentCache, Base_->ArrayCache,
- Base_->FunctionCache),
+ : Node(KAbiTagAttr, Base_->getRHSComponentCache(), Base_->getArrayCache(),
+ Base_->getFunctionCache()),
Base(Base_), Tag(Tag_) {}
template<typename Fn> void match(Fn F) const { F(Base, Tag); }
@@ -615,7 +617,7 @@ class PointerType final : public Node {
public:
PointerType(const Node *Pointee_)
- : Node(KPointerType, Pointee_->RHSComponentCache),
+ : Node(KPointerType, Pointee_->getRHSComponentCache()),
Pointee(Pointee_) {}
const Node *getPointee() const { return Pointee; }
@@ -699,7 +701,7 @@ class ReferenceType : public Node {
public:
ReferenceType(const Node *Pointee_, ReferenceKind RK_)
- : Node(KReferenceType, Pointee_->RHSComponentCache),
+ : Node(KReferenceType, Pointee_->getRHSComponentCache()),
Pointee(Pointee_), RK(RK_) {}
template<typename Fn> void match(Fn F) const { F(Pointee, RK); }
@@ -742,7 +744,7 @@ class PointerToMemberType final : public Node {
public:
PointerToMemberType(const Node *ClassType_, const Node *MemberType_)
- : Node(KPointerToMemberType, MemberType_->RHSComponentCache),
+ : Node(KPointerToMemberType, MemberType_->getRHSComponentCache()),
ClassType(ClassType_), MemberType(MemberType_) {}
template<typename Fn> void match(Fn F) const { F(ClassType, MemberType); }
@@ -1383,16 +1385,14 @@ class ParameterPack final : public Node {
public:
ParameterPack(NodeArray Data_) : Node(KParameterPack), Data(Data_) {
ArrayCache = FunctionCache = RHSComponentCache = Cache::Unknown;
- if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
- return P->ArrayCache == Cache::No;
- }))
+ if (std::all_of(Data.begin(), Data.end(),
+ [](Node *P) { return P->getArrayCache() == Cache::No; }))
ArrayCache = Cache::No;
- if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
- return P->FunctionCache == Cache::No;
- }))
+ if (std::all_of(Data.begin(), Data.end(),
+ [](Node *P) { return P->getFunctionCache() == Cache::No; }))
FunctionCache = Cache::No;
- if (std::all_of(Data.begin(), Data.end(), [](Node* P) {
- return P->RHSComponentCache == Cache::No;
+ if (std::all_of(Data.begin(), Data.end(), [](Node *P) {
+ return P->getRHSComponentCache() == Cache::No;
}))
RHSComponentCache = Cache::No;
}
diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp
index fe55989..ab783cf 100644
--- a/libcxxabi/test/test_demangle.pass.cpp
+++ b/libcxxabi/test/test_demangle.pass.cpp
@@ -17,6 +17,8 @@
// 80-bit format, and this demangling test is failing on it.
// XFAIL: LIBCXX-ANDROID-FIXME && target={{i686|x86_64}}-{{.+}}-android{{.*}}
+// XFAIL: win32-broken-printf-a-precision
+
#include "support/timer.h"
#include <algorithm>
#include <cassert>
diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 2ec60e4..7585573 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -2589,7 +2589,8 @@ void UnwindCursor<A, R>::setInfoBasedOnIPRegister(bool isReturnAddress) {
--pc;
#endif
-#if !(defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32))
+#if !(defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32)) && \
+ !defined(_LIBUNWIND_SUPPORT_TBTAB_UNWIND)
// In case of this is frame of signal handler, the IP saved in the signal
// handler points to first non-executed instruction, while FDE/CIE expects IP
// to be after the first non-executed instruction.
diff --git a/libunwind/test/aix_signal_unwind.pass.sh.S b/libunwind/test/aix_signal_unwind.pass.sh.S
index 9ca18e9..a666577 100644
--- a/libunwind/test/aix_signal_unwind.pass.sh.S
+++ b/libunwind/test/aix_signal_unwind.pass.sh.S
@@ -10,7 +10,7 @@
// a correct traceback when the function raising the signal does not save
// the link register or does not store the stack back chain.
-// REQUIRES: target=powerpc{{(64)?}}-ibm-aix
+// REQUIRES: target=powerpc{{(64)?}}-ibm-aix{{.*}}
// Test when the function raising the signal does not save the link register
// RUN: %{cxx} -x c++ %s -o %t.exe -DCXX_CODE %{flags} %{compile_flags}
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 0173be3..183dc88 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -27,6 +27,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GlobPattern.h"
#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/TarWriter.h"
#include <atomic>
#include <memory>
#include <optional>
@@ -332,6 +333,7 @@ struct Config {
bool zPacPlt;
bool zRelro;
bool zRodynamic;
+ bool zSectionHeader;
bool zShstk;
bool zStartStopGC;
uint8_t zStartStopVisibility;
@@ -458,6 +460,15 @@ struct ConfigWrapper {
LLVM_LIBRARY_VISIBILITY extern ConfigWrapper config;
+// Some index properties of a symbol are stored separately in this auxiliary
+// struct to decrease sizeof(SymbolUnion) in the majority of cases.
+struct SymbolAux {
+ uint32_t gotIdx = -1;
+ uint32_t pltIdx = -1;
+ uint32_t tlsDescIdx = -1;
+ uint32_t tlsGdIdx = -1;
+};
+
struct DuplicateSymbol {
const Symbol *sym;
const InputFile *file;
@@ -475,6 +486,8 @@ struct Ctx {
SmallVector<BitcodeFile *, 0> lazyBitcodeFiles;
SmallVector<InputSectionBase *, 0> inputSections;
SmallVector<EhInputSection *, 0> ehInputSections;
+
+ SmallVector<SymbolAux, 0> symAux;
// Duplicate symbol candidates.
SmallVector<DuplicateSymbol, 0> duplicates;
// Symbols in a non-prevailing COMDAT group which should be changed to an
@@ -489,6 +502,9 @@ struct Ctx {
std::pair<const InputFile *, const InputFile *>>
backwardReferences;
llvm::SmallSet<llvm::StringRef, 0> auxiliaryFiles;
+ // If --reproduce is specified, all input files are written to this tar
+ // archive.
+ std::unique_ptr<llvm::TarWriter> tar;
// InputFile for linker created symbols with no source location.
InputFile *internalFile;
// True if SHT_LLVM_SYMPART is used.
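
SymbolAux moves the rarely needed GOT/PLT/TLS slot indices out of the symbol itself; a symbol only carries a small index into ctx.symAux. A simplified sketch of the indirection, with Sym and LinkCtx as hypothetical stand-ins for lld's real Symbol and Ctx types:

#include <cstdint>
#include <vector>

struct Aux { // mirrors SymbolAux: uint32_t(-1) means "no entry assigned"
  uint32_t gotIdx = -1;
  uint32_t pltIdx = -1;
};

struct Sym {
  uint32_t auxIdx = 0; // index into the shared aux table
};

struct LinkCtx {
  std::vector<Aux> aux; // grows only for symbols that actually need slots
};

inline uint32_t pltIndex(const LinkCtx &ctx, const Sym &s) {
  return ctx.aux[s.auxIdx].pltIdx;
}

The payoff is that a symbol needing none of these slots pays only for the 32-bit auxIdx, which is what keeps sizeof(SymbolUnion) down in the common case.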
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 40e095a..a8c52e8 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -101,11 +101,14 @@ void Ctx::reset() {
lazyBitcodeFiles.clear();
inputSections.clear();
ehInputSections.clear();
+
+ symAux.clear();
duplicates.clear();
nonPrevailingSyms.clear();
whyExtractRecords.clear();
backwardReferences.clear();
auxiliaryFiles.clear();
+ tar.reset();
internalFile = nullptr;
hasSympart.store(false, std::memory_order_relaxed);
hasTlsIe.store(false, std::memory_order_relaxed);
@@ -136,9 +139,7 @@ bool link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS,
symtab = SymbolTable();
outputSections.clear();
- symAux.clear();
- tar = nullptr;
in.reset();
partitions.clear();
@@ -153,7 +154,7 @@ bool link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS,
config = ConfigWrapper();
script = ScriptWrapper();
- symAux.emplace_back();
+ elf::ctx.symAux.emplace_back();
partitions.clear();
partitions.emplace_back();
@@ -224,14 +225,15 @@ std::vector<std::pair<MemoryBufferRef, uint64_t>> static getArchiveMembers(
std::vector<std::pair<MemoryBufferRef, uint64_t>> v;
Error err = Error::success();
- bool addToTar = file->isThin() && tar;
+ bool addToTar = file->isThin() && ctx.tar;
for (const Archive::Child &c : file->children(err)) {
MemoryBufferRef mbref =
CHECK(c.getMemoryBufferRef(),
mb.getBufferIdentifier() +
": could not get the buffer for a child of the archive");
if (addToTar)
- tar->append(relativeToRoot(check(c.getFullName())), mbref.getBuffer());
+ ctx.tar->append(relativeToRoot(check(c.getFullName())),
+ mbref.getBuffer());
v.push_back(std::make_pair(mbref, c.getChildOffset()));
}
if (err)
@@ -445,6 +447,8 @@ static void checkOptions() {
error("-r and --export-dynamic may not be used together");
if (config->debugNames)
error("-r and --debug-names may not be used together");
+ if (!config->zSectionHeader)
+ error("-r and -z nosectionheader may not be used together");
}
if (config->executeOnly) {
@@ -640,9 +644,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
Expected<std::unique_ptr<TarWriter>> errOrWriter =
TarWriter::create(path, path::stem(path));
if (errOrWriter) {
- tar = std::move(*errOrWriter);
- tar->append("response.txt", createResponseFile(args));
- tar->append("version.txt", getLLDVersion() + "\n");
+ ctx.tar = std::move(*errOrWriter);
+ ctx.tar->append("response.txt", createResponseFile(args));
+ ctx.tar->append("version.txt", getLLDVersion() + "\n");
StringRef ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile);
if (!ltoSampleProfile.empty())
readFile(ltoSampleProfile);
@@ -834,6 +838,8 @@ static ICFLevel getICF(opt::InputArgList &args) {
static StripPolicy getStrip(opt::InputArgList &args) {
if (args.hasArg(OPT_relocatable))
return StripPolicy::None;
+ if (!config->zSectionHeader)
+ return StripPolicy::All;
auto *arg = args.getLastArg(OPT_strip_all, OPT_strip_debug);
if (!arg)
@@ -1409,7 +1415,9 @@ static void readConfigs(opt::InputArgList &args) {
config->soName = args.getLastArgValue(OPT_soname);
config->sortSection = getSortSection(args);
config->splitStackAdjustSize = args::getInteger(args, OPT_split_stack_adjust_size, 16384);
- config->strip = getStrip(args);
+ config->zSectionHeader =
+ getZFlag(args, "sectionheader", "nosectionheader", true);
+ config->strip = getStrip(args); // needs zSectionHeader
config->sysroot = args.getLastArgValue(OPT_sysroot);
config->target1Rel = args.hasFlag(OPT_target1_rel, OPT_target1_abs, false);
config->target2 = getTarget2(args);
@@ -1911,13 +1919,7 @@ void LinkerDriver::createFiles(opt::InputArgList &args) {
hasInput = true;
break;
case OPT_defsym: {
- StringRef from;
- StringRef to;
- std::tie(from, to) = StringRef(arg->getValue()).split('=');
- if (from.empty() || to.empty())
- error("--defsym: syntax error: " + StringRef(arg->getValue()));
- else
- readDefsym(from, MemoryBufferRef(to, "--defsym"));
+ readDefsym(MemoryBufferRef(arg->getValue(), "--defsym"));
break;
}
case OPT_script:
diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp
index bfc605c..a6b52d7 100644
--- a/lld/ELF/ICF.cpp
+++ b/lld/ELF/ICF.cpp
@@ -103,12 +103,12 @@ private:
void segregate(size_t begin, size_t end, uint32_t eqClassBase, bool constant);
template <class RelTy>
- bool constantEq(const InputSection *a, ArrayRef<RelTy> relsA,
- const InputSection *b, ArrayRef<RelTy> relsB);
+ bool constantEq(const InputSection *a, Relocs<RelTy> relsA,
+ const InputSection *b, Relocs<RelTy> relsB);
template <class RelTy>
- bool variableEq(const InputSection *a, ArrayRef<RelTy> relsA,
- const InputSection *b, ArrayRef<RelTy> relsB);
+ bool variableEq(const InputSection *a, Relocs<RelTy> relsA,
+ const InputSection *b, Relocs<RelTy> relsB);
bool equalsConstant(const InputSection *a, const InputSection *b);
bool equalsVariable(const InputSection *a, const InputSection *b);
@@ -235,8 +235,8 @@ void ICF<ELFT>::segregate(size_t begin, size_t end, uint32_t eqClassBase,
// Compare two lists of relocations.
template <class ELFT>
template <class RelTy>
-bool ICF<ELFT>::constantEq(const InputSection *secA, ArrayRef<RelTy> ra,
- const InputSection *secB, ArrayRef<RelTy> rb) {
+bool ICF<ELFT>::constantEq(const InputSection *secA, Relocs<RelTy> ra,
+ const InputSection *secB, Relocs<RelTy> rb) {
if (ra.size() != rb.size())
return false;
auto rai = ra.begin(), rae = ra.end(), rbi = rb.begin();
@@ -333,8 +333,8 @@ bool ICF<ELFT>::equalsConstant(const InputSection *a, const InputSection *b) {
// relocations point to the same section in terms of ICF.
template <class ELFT>
template <class RelTy>
-bool ICF<ELFT>::variableEq(const InputSection *secA, ArrayRef<RelTy> ra,
- const InputSection *secB, ArrayRef<RelTy> rb) {
+bool ICF<ELFT>::variableEq(const InputSection *secA, Relocs<RelTy> ra,
+ const InputSection *secB, Relocs<RelTy> rb) {
assert(ra.size() == rb.size());
auto rai = ra.begin(), rae = ra.end(), rbi = rb.begin();
@@ -441,7 +441,7 @@ void ICF<ELFT>::forEachClass(llvm::function_ref<void(size_t, size_t)> fn) {
// hash.
template <class RelTy>
static void combineRelocHashes(unsigned cnt, InputSection *isec,
- ArrayRef<RelTy> rels) {
+ Relocs<RelTy> rels) {
uint32_t hash = isec->eqClass[cnt % 2];
for (RelTy rel : rels) {
Symbol &s = isec->file->getRelocTargetSym(rel);
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 03ff4ea..3bc3984 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -28,7 +28,6 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RISCVAttributeParser.h"
-#include "llvm/Support/TarWriter.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>
@@ -52,8 +51,6 @@ extern template void ObjFile<ELF64BE>::importCmseSymbols();
bool InputFile::isInGroup;
uint32_t InputFile::nextGroupId;
-std::unique_ptr<TarWriter> elf::tar;
-
// Returns "<internal>", "foo.a(bar.o)" or "baz.o".
std::string lld::toString(const InputFile *f) {
static std::mutex mu;
@@ -261,8 +258,8 @@ std::optional<MemoryBufferRef> elf::readFile(StringRef path) {
MemoryBufferRef mbref = (*mbOrErr)->getMemBufferRef();
ctx.memoryBuffers.push_back(std::move(*mbOrErr)); // take MB ownership
- if (tar)
- tar->append(relativeToRoot(path), mbref.getBuffer());
+ if (ctx.tar)
+ ctx.tar->append(relativeToRoot(path), mbref.getBuffer());
return mbref;
}
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 0617f41..755b30d 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -39,9 +39,6 @@ namespace elf {
class InputSection;
class Symbol;
-// If --reproduce is specified, all input files are written to this tar archive.
-extern std::unique_ptr<llvm::TarWriter> tar;
-
// Opens a given file.
std::optional<MemoryBufferRef> readFile(StringRef path);
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 12ab1f1..7857d85 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -911,7 +911,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
// So, we handle relocations for non-alloc sections directly in this
// function as a performance optimization.
template <class ELFT, class RelTy>
-void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
+void InputSection::relocateNonAlloc(uint8_t *buf, Relocs<RelTy> rels) {
const unsigned bits = sizeof(typename ELFT::uint) * 8;
const TargetInfo &target = *elf::target;
const auto emachine = config->emachine;
@@ -1073,11 +1073,7 @@ void InputSectionBase::relocate(uint8_t *buf, uint8_t *bufEnd) {
auto *sec = cast<InputSection>(this);
// For a relocatable link, also call relocateNonAlloc() to rewrite applicable
// locations with tombstone values.
- const RelsOrRelas<ELFT> rels = sec->template relsOrRelas<ELFT>();
- if (rels.areRelocsRel())
- sec->relocateNonAlloc<ELFT>(buf, rels.rels);
- else
- sec->relocateNonAlloc<ELFT>(buf, rels.relas);
+ invokeOnRelocs(*sec, sec->relocateNonAlloc<ELFT>, buf);
}
// For each function-defining prologue, find any calls to __morestack,
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h
index ec12235..c89a545 100644
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -37,11 +37,20 @@ LLVM_LIBRARY_VISIBILITY extern std::vector<Partition> partitions;
// Returned by InputSectionBase::relsOrRelas. At least one member is empty.
template <class ELFT> struct RelsOrRelas {
- ArrayRef<typename ELFT::Rel> rels;
- ArrayRef<typename ELFT::Rela> relas;
+ Relocs<typename ELFT::Rel> rels;
+ Relocs<typename ELFT::Rela> relas;
bool areRelocsRel() const { return rels.size(); }
};
+#define invokeOnRelocs(sec, f, ...) \
+ { \
+ const RelsOrRelas<ELFT> rs = (sec).template relsOrRelas<ELFT>(); \
+ if (rs.areRelocsRel()) \
+ f(__VA_ARGS__, rs.rels); \
+ else \
+ f(__VA_ARGS__, rs.relas); \
+ }
+
// This is the base class of all sections that lld handles. Some are sections in
// input files, some are sections in the produced output file and some exist
// just as a convenience for implementing special ways of combining some
@@ -407,7 +416,7 @@ public:
InputSectionBase *getRelocatedSection() const;
template <class ELFT, class RelTy>
- void relocateNonAlloc(uint8_t *buf, llvm::ArrayRef<RelTy> rels);
+ void relocateNonAlloc(uint8_t *buf, Relocs<RelTy> rels);
// Points to the canonical section. If ICF folds two sections, repl pointer of
// one section points to the other.
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 6ad5c3b..94ad7b3 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -475,8 +475,9 @@ private:
uint64_t relOff) const;
void processAux(RelExpr expr, RelType type, uint64_t offset, Symbol &sym,
int64_t addend) const;
- template <class ELFT, class RelTy> void scanOne(RelTy *&i);
- template <class ELFT, class RelTy> void scan(ArrayRef<RelTy> rels);
+ template <class ELFT, class RelTy>
+ void scanOne(typename Relocs<RelTy>::const_iterator &i);
+ template <class ELFT, class RelTy> void scan(Relocs<RelTy> rels);
};
} // namespace
@@ -1434,7 +1435,8 @@ static unsigned handleTlsRelocation(RelType type, Symbol &sym,
return 0;
}
-template <class ELFT, class RelTy> void RelocationScanner::scanOne(RelTy *&i) {
+template <class ELFT, class RelTy>
+void RelocationScanner::scanOne(typename Relocs<RelTy>::const_iterator &i) {
const RelTy &rel = *i;
uint32_t symIndex = rel.getSymbol(config->isMips64EL);
Symbol &sym = sec->getFile<ELFT>()->getSymbol(symIndex);
@@ -1575,7 +1577,7 @@ static void checkPPC64TLSRelax(InputSectionBase &sec, ArrayRef<RelTy> rels) {
}
template <class ELFT, class RelTy>
-void RelocationScanner::scan(ArrayRef<RelTy> rels) {
+void RelocationScanner::scan(Relocs<RelTy> rels) {
// Not all relocations end up in Sec->Relocations, but a lot do.
sec->relocations.reserve(rels.size());
@@ -1593,7 +1595,7 @@ void RelocationScanner::scan(ArrayRef<RelTy> rels) {
end = static_cast<const void *>(rels.end());
for (auto i = rels.begin(); i != end;)
- scanOne<ELFT>(i);
+ scanOne<ELFT, RelTy>(i);
// Sort relocations by offset for more efficient searching for
// R_RISCV_PCREL_HI20 and R_PPC64_ADDR64.
@@ -1714,7 +1716,7 @@ static bool handleNonPreemptibleIfunc(Symbol &sym, uint16_t flags) {
auto &dyn = config->androidPackDynRelocs ? *in.relaPlt : *mainPart->relaDyn;
addPltEntry(*in.iplt, *in.igotPlt, dyn, target->iRelativeRel, *directSym);
sym.allocateAux();
- symAux.back().pltIdx = symAux[directSym->auxIdx].pltIdx;
+ ctx.symAux.back().pltIdx = ctx.symAux[directSym->auxIdx].pltIdx;
if (flags & HAS_DIRECT_RELOC) {
// Change the value to the IPLT and redirect all references to it.
@@ -1832,7 +1834,7 @@ void elf::postScanRelocations() {
{R_ADDEND, target->symbolicRel, got->getTlsIndexOff(), 1, &dummy});
}
- assert(symAux.size() == 1);
+ assert(ctx.symAux.size() == 1);
for (Symbol *sym : symtab.getSymbols())
fn(*sym);
@@ -2410,11 +2412,7 @@ template <class ELFT> void elf::checkNoCrossRefs() {
if (!isd)
continue;
parallelForEach(isd->sections, [&](InputSection *sec) {
- const RelsOrRelas<ELFT> rels = sec->template relsOrRelas<ELFT>();
- if (rels.areRelocsRel())
- scanCrossRefs<ELFT>(noxref, osec, sec, rels.rels);
- else
- scanCrossRefs<ELFT>(noxref, osec, sec, rels.relas);
+ invokeOnRelocs(*sec, scanCrossRefs<ELFT>, noxref, osec, sec);
});
}
}
diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h
index 1bee0de..77d8d52 100644
--- a/lld/ELF/Relocations.h
+++ b/lld/ELF/Relocations.h
@@ -205,6 +205,11 @@ private:
uint32_t pass = 0;
};
+template <class RelTy> struct Relocs : ArrayRef<RelTy> {
+ Relocs() = default;
+ Relocs(ArrayRef<RelTy> a) : ArrayRef<RelTy>(a) {}
+};
+
// Return a int64_t to make sure we get the sign extension out of the way as
// early as possible.
template <class ELFT>
@@ -217,14 +222,15 @@ static inline int64_t getAddend(const typename ELFT::Rela &rel) {
}
template <typename RelTy>
-ArrayRef<RelTy> sortRels(ArrayRef<RelTy> rels, SmallVector<RelTy, 0> &storage) {
+inline Relocs<RelTy> sortRels(Relocs<RelTy> rels,
+ SmallVector<RelTy, 0> &storage) {
auto cmp = [](const RelTy &a, const RelTy &b) {
return a.r_offset < b.r_offset;
};
if (!llvm::is_sorted(rels, cmp)) {
storage.assign(rels.begin(), rels.end());
llvm::stable_sort(storage, cmp);
- rels = storage;
+ rels = Relocs<RelTy>(storage);
}
return rels;
}
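
Relocs is a thin typed wrapper around ArrayRef: it behaves exactly like the ArrayRef it derives from, but gives relocation arrays their own type so relocation-specific helpers such as sortRels and scan can be selected on it. A sketch of the same wrapper pattern, with Rel64 as a hypothetical stand-in for a real ELFT::Rel record:

#include <cstdint>

#include "llvm/ADT/ArrayRef.h"

struct Rel64 {
  uint64_t r_offset;
  uint32_t r_sym;
  uint32_t r_type;
};

template <class RelTy> struct DemoRelocs : llvm::ArrayRef<RelTy> {
  DemoRelocs() = default;
  DemoRelocs(llvm::ArrayRef<RelTy> a) : llvm::ArrayRef<RelTy>(a) {}
};

// Overloads taking DemoRelocs<RelTy> read like ArrayRef code, but the
// distinct type documents that the span holds relocation records.
uint64_t firstOffset(DemoRelocs<Rel64> rels) {
  return rels.empty() ? 0 : rels.front().r_offset;
}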
diff --git a/lld/ELF/ScriptLexer.cpp b/lld/ELF/ScriptLexer.cpp
index 40c4637..40528f0 100644
--- a/lld/ELF/ScriptLexer.cpp
+++ b/lld/ELF/ScriptLexer.cpp
@@ -27,89 +27,84 @@
//===----------------------------------------------------------------------===//
#include "ScriptLexer.h"
+#include "Config.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
#include <algorithm>
using namespace llvm;
using namespace lld;
using namespace lld::elf;
+ScriptLexer::Buffer::Buffer(MemoryBufferRef mb)
+ : s(mb.getBuffer()), filename(mb.getBufferIdentifier()),
+ begin(mb.getBufferStart()) {
+ if (config->sysroot == "")
+ return;
+ StringRef path = filename;
+ for (; !path.empty(); path = sys::path::parent_path(path)) {
+ if (!sys::fs::equivalent(config->sysroot, path))
+ continue;
+ isUnderSysroot = true;
+ return;
+ }
+}
+
+ScriptLexer::ScriptLexer(MemoryBufferRef mb) : curBuf(mb), mbs(1, mb) {
+ activeFilenames.insert(mb.getBufferIdentifier());
+}
+
// Returns a whole line containing the current token.
StringRef ScriptLexer::getLine() {
StringRef s = getCurrentMB().getBuffer();
- StringRef tok = tokens[pos - 1];
- size_t pos = s.rfind('\n', tok.data() - s.data());
+ size_t pos = s.rfind('\n', prevTok.data() - s.data());
if (pos != StringRef::npos)
s = s.substr(pos + 1);
return s.substr(0, s.find_first_of("\r\n"));
}
-// Returns 1-based line number of the current token.
-size_t ScriptLexer::getLineNumber() {
- if (pos == 0)
- return 1;
- StringRef s = getCurrentMB().getBuffer();
- StringRef tok = tokens[pos - 1];
- const size_t tokOffset = tok.data() - s.data();
-
- // For the first token, or when going backwards, start from the beginning of
- // the buffer. If this token is after the previous token, start from the
- // previous token.
- size_t line = 1;
- size_t start = 0;
- if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) {
- start = lastLineNumberOffset;
- line = lastLineNumber;
- }
-
- line += s.substr(start, tokOffset - start).count('\n');
-
- // Store the line number of this token for reuse.
- lastLineNumberOffset = tokOffset;
- lastLineNumber = line;
-
- return line;
-}
-
// Returns 0-based column number of the current token.
size_t ScriptLexer::getColumnNumber() {
- StringRef tok = tokens[pos - 1];
- return tok.data() - getLine().data();
+ return prevTok.data() - getLine().data();
}
std::string ScriptLexer::getCurrentLocation() {
std::string filename = std::string(getCurrentMB().getBufferIdentifier());
- return (filename + ":" + Twine(getLineNumber())).str();
+ return (filename + ":" + Twine(prevTokLine)).str();
}
-ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
-
// We don't want to record cascading errors. Keep only the first one.
void ScriptLexer::setError(const Twine &msg) {
if (errorCount())
return;
std::string s = (getCurrentLocation() + ": " + msg).str();
- if (pos)
+ if (prevTok.size())
s += "\n>>> " + getLine().str() + "\n>>> " +
std::string(getColumnNumber(), ' ') + "^";
error(s);
}
-// Split S into linker script tokens.
-void ScriptLexer::tokenize(MemoryBufferRef mb) {
- std::vector<StringRef> vec;
- mbs.push_back(mb);
- StringRef s = mb.getBuffer();
- StringRef begin = s;
-
+void ScriptLexer::lex() {
for (;;) {
+ StringRef &s = curBuf.s;
s = skipSpace(s);
- if (s.empty())
- break;
+ if (s.empty()) {
+ // If this buffer is from an INCLUDE command, switch to the "return
+ // value"; otherwise, mark EOF.
+ if (buffers.empty()) {
+ eof = true;
+ return;
+ }
+ activeFilenames.erase(curBuf.filename);
+ curBuf = buffers.pop_back_val();
+ continue;
+ }
+ curTokState = inExpr;
// Quoted token. Note that double-quote characters are parts of a token
// because, in a glob match context, only unquoted tokens are interpreted
@@ -118,45 +113,53 @@ void ScriptLexer::tokenize(MemoryBufferRef mb) {
if (s.starts_with("\"")) {
size_t e = s.find("\"", 1);
if (e == StringRef::npos) {
- StringRef filename = mb.getBufferIdentifier();
- size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
- error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
+ size_t lineno =
+ StringRef(curBuf.begin, s.data() - curBuf.begin).count('\n');
+ error(curBuf.filename + ":" + Twine(lineno + 1) + ": unclosed quote");
return;
}
- vec.push_back(s.take_front(e + 1));
+ curTok = s.take_front(e + 1);
s = s.substr(e + 1);
- continue;
+ return;
}
// Some operators form separate tokens.
if (s.starts_with("<<=") || s.starts_with(">>=")) {
- vec.push_back(s.substr(0, 3));
+ curTok = s.substr(0, 3);
s = s.substr(3);
- continue;
+ return;
}
- if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
- (s[0] == s[1] && strchr("<>&|", s[0])))) {
- vec.push_back(s.substr(0, 2));
+ if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
+ curTok = s.substr(0, 2);
s = s.substr(2);
- continue;
+ return;
}
- // Unquoted token. This is more relaxed than tokens in C-like language,
- // so that you can write "file-name.cpp" as one bare token, for example.
- size_t pos = s.find_first_not_of(
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
- "0123456789_.$/\\~=+[]*?-!^:");
+ // Unquoted token. The non-expression token is more relaxed than tokens in
+ // C-like languages, so that you can write "file-name.cpp" as one bare
+ // token.
+ size_t pos;
+ if (inExpr) {
+ pos = s.find_first_not_of(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+ "0123456789_.$");
+ if (pos == 0 && s.size() >= 2 &&
+ ((s[0] == s[1] && strchr("<>&|", s[0])) ||
+ is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2))))
+ pos = 2;
+ } else {
+ pos = s.find_first_not_of(
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+ "0123456789_.$/\\~=+[]*?-!^:");
+ }
- // A character that cannot start a word (which is usually a
- // punctuation) forms a single character token.
if (pos == 0)
pos = 1;
- vec.push_back(s.substr(0, pos));
+ curTok = s.substr(0, pos);
s = s.substr(pos);
+ break;
}
-
- tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
}
// Skip leading whitespace characters or comments.
@@ -168,6 +171,7 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
setError("unclosed comment in a linker script");
return "";
}
+ curBuf.lineNumber += s.substr(0, e).count('\n');
s = s.substr(e + 2);
continue;
}
@@ -175,103 +179,48 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
size_t e = s.find('\n', 1);
if (e == StringRef::npos)
e = s.size() - 1;
+ else
+ ++curBuf.lineNumber;
s = s.substr(e + 1);
continue;
}
- size_t size = s.size();
+ StringRef saved = s;
s = s.ltrim();
- if (s.size() == size)
+ auto len = saved.size() - s.size();
+ if (len == 0)
return s;
+ curBuf.lineNumber += saved.substr(0, len).count('\n');
}
}
-// An erroneous token is handled as if it were the last token before EOF.
-bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
-
-// Split a given string as an expression.
-// This function returns "3", "*" and "5" for "3*5" for example.
-static std::vector<StringRef> tokenizeExpr(StringRef s) {
- StringRef ops = "!~*/+-<>?^:="; // List of operators
-
- // Quoted strings are literal strings, so we don't want to split it.
- if (s.starts_with("\""))
- return {s};
-
- // Split S with operators as separators.
- std::vector<StringRef> ret;
- while (!s.empty()) {
- size_t e = s.find_first_of(ops);
-
- // No need to split if there is no operator.
- if (e == StringRef::npos) {
- ret.push_back(s);
- break;
- }
-
- // Get a token before the operator.
- if (e != 0)
- ret.push_back(s.substr(0, e));
-
- // Get the operator as a token.
- // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
- if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
- s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
- s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
- ret.push_back(s.substr(e, 2));
- s = s.substr(e + 2);
- } else {
- ret.push_back(s.substr(e, 1));
- s = s.substr(e + 1);
- }
- }
- return ret;
-}
-
-// In contexts where expressions are expected, the lexer should apply
-// different tokenization rules than the default one. By default,
-// arithmetic operator characters are regular characters, but in the
-// expression context, they should be independent tokens.
-//
-// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
-// in the expression context.
-//
-// This function may split the current token into multiple tokens.
-void ScriptLexer::maybeSplitExpr() {
- if (!inExpr || errorCount() || atEOF())
- return;
-
- std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
- if (v.size() == 1)
- return;
- tokens.erase(tokens.begin() + pos);
- tokens.insert(tokens.begin() + pos, v.begin(), v.end());
-}
+// Used to determine whether to stop parsing. Treat errors like EOF.
+bool ScriptLexer::atEOF() { return eof || errorCount(); }
StringRef ScriptLexer::next() {
- maybeSplitExpr();
-
- if (errorCount())
- return "";
- if (atEOF()) {
- setError("unexpected EOF");
- return "";
- }
- return tokens[pos++];
+ prevTok = peek();
+ // `prevTokLine` is not updated for EOF so that the line number in `setError`
+ // will be more useful.
+ if (prevTok.size())
+ prevTokLine = curBuf.lineNumber;
+ return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
}
StringRef ScriptLexer::peek() {
- StringRef tok = next();
- if (errorCount())
- return "";
- pos = pos - 1;
- return tok;
+ // curTok is invalid if curTokState and inExpr mismatch.
+ if (curTok.size() && curTokState != inExpr) {
+ curBuf.s = StringRef(curTok.data(), curBuf.s.end() - curTok.data());
+ curTok = {};
+ }
+ if (curTok.empty())
+ lex();
+ return curTok;
}
bool ScriptLexer::consume(StringRef tok) {
- if (next() == tok)
- return true;
- --pos;
- return false;
+ if (peek() != tok)
+ return false;
+ next();
+ return true;
}
void ScriptLexer::skip() { (void)next(); }
@@ -280,8 +229,23 @@ void ScriptLexer::expect(StringRef expect) {
if (errorCount())
return;
StringRef tok = next();
- if (tok != expect)
- setError(expect + " expected, but got " + tok);
+ if (tok != expect) {
+ if (atEOF())
+ setError("unexpected EOF");
+ else
+ setError(expect + " expected, but got " + tok);
+ }
+}
+
+ScriptLexer::Token ScriptLexer::till(StringRef tok) {
+ StringRef str = next();
+ if (str == tok)
+ return {};
+ if (!atEOF())
+ return {str};
+ prevTok = {};
+ setError("unexpected EOF");
+ return {};
}
// Returns true if S encloses T.
@@ -292,10 +256,8 @@ static bool encloses(StringRef s, StringRef t) {
MemoryBufferRef ScriptLexer::getCurrentMB() {
// Find input buffer containing the current token.
assert(!mbs.empty());
- if (pos == 0)
- return mbs.back();
for (MemoryBufferRef mb : mbs)
- if (encloses(mb.getBuffer(), tokens[pos - 1]))
+ if (encloses(mb.getBuffer(), curBuf.s))
return mb;
llvm_unreachable("getCurrentMB: failed to find a token");
}
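
Taken together, the ScriptLexer.cpp changes replace the old tokenize-everything-up-front model with a pull-based lexer: peek() lazily produces one token and caches it in curTok, next() hands the cached token out and records it as prevTok, and atEOF/consume/till are built on top of that pair. A minimal standalone sketch of the caching pattern, assuming plain whitespace-separated tokens and none of the real quoting, INCLUDE, or expression-mode handling:

    #include <cctype>
    #include <string>

    class TinyLexer {
      std::string buf;      // remaining input
      size_t pos = 0;
      std::string curTok;   // token cached by the last lex(); empty means EOF
      bool haveTok = false;

      void lex() {
        while (pos < buf.size() && std::isspace((unsigned char)buf[pos]))
          ++pos;
        size_t start = pos;
        while (pos < buf.size() && !std::isspace((unsigned char)buf[pos]))
          ++pos;
        curTok = buf.substr(start, pos - start);
        haveTok = true;
      }

    public:
      explicit TinyLexer(std::string s) : buf(std::move(s)) {}

      // peek() lexes lazily and caches; next() consumes the cached token.
      const std::string &peek() {
        if (!haveTok)
          lex();
        return curTok;
      }
      std::string next() {
        peek();
        haveTok = false;
        return curTok;
      }
      bool consume(const std::string &tok) {
        if (peek() != tok)
          return false;
        next();
        return true;
      }
      bool atEOF() { return peek().empty(); }
    };
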
diff --git a/lld/ELF/ScriptLexer.h b/lld/ELF/ScriptLexer.h
index 7d945d8..ffd8441 100644
--- a/lld/ELF/ScriptLexer.h
+++ b/lld/ELF/ScriptLexer.h
@@ -10,6 +10,8 @@
#define LLD_ELF_SCRIPT_LEXER_H
#include "lld/Common/LLVM.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MemoryBufferRef.h"
#include <vector>
@@ -17,11 +19,48 @@
namespace lld::elf {
class ScriptLexer {
+protected:
+ struct Buffer {
+ // The remaining content to parse and the filename.
+ StringRef s, filename;
+ const char *begin = nullptr;
+ size_t lineNumber = 1;
+ // True if the script is opened as an absolute path under the --sysroot
+ // directory.
+ bool isUnderSysroot = false;
+
+ Buffer() = default;
+ Buffer(MemoryBufferRef mb);
+ };
+ // The current buffer and parent buffers due to INCLUDE.
+ Buffer curBuf;
+ SmallVector<Buffer, 0> buffers;
+
+ // Used to detect INCLUDE() cycles.
+ llvm::DenseSet<StringRef> activeFilenames;
+
+ struct Token {
+ StringRef str;
+ explicit operator bool() const { return !str.empty(); }
+ operator StringRef() const { return str; }
+ };
+
+ // The token before the last next().
+ StringRef prevTok;
+ // Rules for what is a token are different when we are in an expression.
+ // curTok holds the cached return value of peek() and is invalid when the
+ // expression state changes.
+ StringRef curTok;
+ size_t prevTokLine = 1;
+ // The inExpr state when curTok is cached.
+ bool curTokState = false;
+ bool eof = false;
+
public:
explicit ScriptLexer(MemoryBufferRef mb);
void setError(const Twine &msg);
- void tokenize(MemoryBufferRef mb);
+ void lex();
StringRef skipSpace(StringRef s);
bool atEOF();
StringRef next();
@@ -29,21 +68,15 @@ public:
void skip();
bool consume(StringRef tok);
void expect(StringRef expect);
+ Token till(StringRef tok);
std::string getCurrentLocation();
MemoryBufferRef getCurrentMB();
std::vector<MemoryBufferRef> mbs;
- std::vector<StringRef> tokens;
bool inExpr = false;
- size_t pos = 0;
-
- size_t lastLineNumber = 0;
- size_t lastLineNumberOffset = 0;
private:
- void maybeSplitExpr();
StringRef getLine();
- size_t getLineNumber();
size_t getColumnNumber();
};
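
The header now keeps per-file state in a Buffer plus a stack of parent buffers, which is what lets INCLUDE be handled inside the lexer itself: entering an INCLUDE pushes the current buffer, exhausting a buffer pops it, and activeFilenames catches INCLUDE cycles. A simplified, self-contained sketch of that mechanism (stand-in types, not the lld classes):

    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    struct Buffer {
      std::string filename;
      std::string text;   // remaining content to lex
    };

    struct IncludeStack {
      Buffer cur;
      std::vector<Buffer> parents;
      std::set<std::string> active;   // filenames currently being lexed

      // Called for INCLUDE "name"; returns false when a cycle is detected.
      bool enter(Buffer next) {
        if (!active.insert(next.filename).second)
          return false;               // cycle in linker script INCLUDEs
        parents.push_back(std::move(cur));
        cur = std::move(next);
        return true;
      }

      // Called when cur.text is exhausted; returns false at real EOF.
      bool leave() {
        if (parents.empty())
          return false;
        active.erase(cur.filename);
        cur = std::move(parents.back());
        parents.pop_back();
        return true;
      }
    };
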
diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp
index 8637a8b..107b4c6 100644
--- a/lld/ELF/ScriptParser.cpp
+++ b/lld/ELF/ScriptParser.cpp
@@ -24,7 +24,6 @@
#include "lld/Common/CommonLinkerContext.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/Casting.h"
@@ -49,22 +48,12 @@ namespace {
class ScriptParser final : ScriptLexer {
public:
ScriptParser(MemoryBufferRef mb) : ScriptLexer(mb) {
- // Initialize IsUnderSysroot
- if (config->sysroot == "")
- return;
- StringRef path = mb.getBufferIdentifier();
- for (; !path.empty(); path = sys::path::parent_path(path)) {
- if (!sys::fs::equivalent(config->sysroot, path))
- continue;
- isUnderSysroot = true;
- return;
- }
}
void readLinkerScript();
void readVersionScript();
void readDynamicList();
- void readDefsym(StringRef name);
+ void readDefsym();
private:
void addFile(StringRef path);
@@ -89,6 +78,7 @@ private:
void readVersionScriptCommand();
void readNoCrossRefs(bool to);
+ StringRef readName();
SymbolAssignment *readSymbolAssignment(StringRef name);
ByteCommand *readByteCommand(StringRef tok);
std::array<uint8_t, 4> readFill();
@@ -122,7 +112,7 @@ private:
Expr combine(StringRef op, Expr l, Expr r);
Expr readExpr();
Expr readExpr1(Expr lhs, int minPrec);
- StringRef readParenLiteral();
+ StringRef readParenName();
Expr readPrimary();
Expr readTernary(Expr cond);
Expr readParenExpr();
@@ -135,12 +125,6 @@ private:
std::pair<SmallVector<SymbolVersion, 0>, SmallVector<SymbolVersion, 0>>
readSymbols();
- // True if a script being read is in the --sysroot directory.
- bool isUnderSysroot = false;
-
- // A set to detect an INCLUDE() cycle.
- StringSet<> seen;
-
// If we are currently parsing a PROVIDE|PROVIDE_HIDDEN command,
// then this member is set to the PROVIDE symbol name.
std::optional<llvm::StringRef> activeProvideSym;
@@ -200,8 +184,9 @@ void ScriptParser::readDynamicList() {
std::tie(locals, globals) = readSymbols();
expect(";");
- if (!atEOF()) {
- setError("EOF expected, but got " + next());
+ StringRef tok = peek();
+ if (tok.size()) {
+ setError("EOF expected, but got " + tok);
return;
}
if (!locals.empty()) {
@@ -215,8 +200,9 @@ void ScriptParser::readDynamicList() {
void ScriptParser::readVersionScript() {
readVersionScriptCommand();
- if (!atEOF())
- setError("EOF expected, but got " + next());
+ StringRef tok = peek();
+ if (tok.size())
+ setError("EOF expected, but got " + tok);
}
void ScriptParser::readVersionScriptCommand() {
@@ -225,7 +211,9 @@ void ScriptParser::readVersionScriptCommand() {
return;
}
- while (!atEOF() && !errorCount() && peek() != "}") {
+ if (atEOF())
+ setError("unexpected EOF");
+ while (peek() != "}" && !atEOF()) {
StringRef verStr = next();
if (verStr == "{") {
setError("anonymous version definition is used in "
@@ -246,6 +234,8 @@ void ScriptParser::readVersion() {
void ScriptParser::readLinkerScript() {
while (!atEOF()) {
StringRef tok = next();
+ if (atEOF())
+ break;
if (tok == ";")
continue;
@@ -293,9 +283,12 @@ void ScriptParser::readLinkerScript() {
}
}
-void ScriptParser::readDefsym(StringRef name) {
+void ScriptParser::readDefsym() {
if (errorCount())
return;
+ inExpr = true;
+ StringRef name = readName();
+ expect("=");
Expr e = readExpr();
if (!atEOF())
setError("EOF expected, but got " + next());
@@ -307,8 +300,8 @@ void ScriptParser::readDefsym(StringRef name) {
void ScriptParser::readNoCrossRefs(bool to) {
expect("(");
NoCrossRefCommand cmd{{}, to};
- while (!errorCount() && !consume(")"))
- cmd.outputSections.push_back(unquote(next()));
+ while (auto tok = till(")"))
+ cmd.outputSections.push_back(unquote(tok));
if (cmd.outputSections.size() < 2)
warn(getCurrentLocation() + ": ignored with fewer than 2 output sections");
else
@@ -316,7 +309,7 @@ void ScriptParser::readNoCrossRefs(bool to) {
}
void ScriptParser::addFile(StringRef s) {
- if (isUnderSysroot && s.starts_with("/")) {
+ if (curBuf.isUnderSysroot && s.starts_with("/")) {
SmallString<128> pathData;
StringRef path = (config->sysroot + s).toStringRef(pathData);
if (sys::fs::exists(path))
@@ -368,24 +361,24 @@ void ScriptParser::readAsNeeded() {
expect("(");
bool orig = config->asNeeded;
config->asNeeded = true;
- while (!errorCount() && !consume(")"))
- addFile(unquote(next()));
+ while (auto tok = till(")"))
+ addFile(unquote(tok));
config->asNeeded = orig;
}
void ScriptParser::readEntry() {
// -e <symbol> takes precedence over ENTRY(<symbol>).
expect("(");
- StringRef tok = next();
+ StringRef name = readName();
if (config->entry.empty())
- config->entry = unquote(tok);
+ config->entry = name;
expect(")");
}
void ScriptParser::readExtern() {
expect("(");
- while (!errorCount() && !consume(")"))
- config->undefined.push_back(unquote(next()));
+ while (auto tok = till(")"))
+ config->undefined.push_back(unquote(tok));
}
void ScriptParser::readGroup() {
@@ -398,45 +391,47 @@ void ScriptParser::readGroup() {
}
void ScriptParser::readInclude() {
- StringRef tok = unquote(next());
-
- if (!seen.insert(tok).second) {
+ StringRef name = readName();
+ if (!activeFilenames.insert(name).second) {
setError("there is a cycle in linker script INCLUDEs");
return;
}
- if (std::optional<std::string> path = searchScript(tok)) {
- if (std::optional<MemoryBufferRef> mb = readFile(*path))
- tokenize(*mb);
+ if (std::optional<std::string> path = searchScript(name)) {
+ if (std::optional<MemoryBufferRef> mb = readFile(*path)) {
+ buffers.push_back(curBuf);
+ curBuf = Buffer(*mb);
+ mbs.push_back(*mb);
+ }
return;
}
- setError("cannot find linker script " + tok);
+ setError("cannot find linker script " + name);
}
void ScriptParser::readInput() {
expect("(");
- while (!errorCount() && !consume(")")) {
- if (consume("AS_NEEDED"))
+ while (auto tok = till(")")) {
+ if (tok == "AS_NEEDED")
readAsNeeded();
else
- addFile(unquote(next()));
+ addFile(unquote(tok));
}
}
void ScriptParser::readOutput() {
// -o <file> takes precedence over OUTPUT(<file>).
expect("(");
- StringRef tok = next();
+ StringRef name = readName();
if (config->outputFile.empty())
- config->outputFile = unquote(tok);
+ config->outputFile = name;
expect(")");
}
void ScriptParser::readOutputArch() {
// OUTPUT_ARCH is ignored for now.
expect("(");
- while (!errorCount() && !consume(")"))
- skip();
+ while (till(")"))
+ ;
}
static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
@@ -478,14 +473,14 @@ static std::pair<ELFKind, uint16_t> parseBfdName(StringRef s) {
void ScriptParser::readOutputFormat() {
expect("(");
- StringRef s = unquote(next());
+ StringRef s = readName();
if (!consume(")")) {
expect(",");
- StringRef tmp = unquote(next());
+ StringRef tmp = readName();
if (config->optEB)
s = tmp;
expect(",");
- tmp = unquote(next());
+ tmp = readName();
if (config->optEL)
s = tmp;
consume(")");
@@ -514,10 +509,9 @@ void ScriptParser::readOutputFormat() {
void ScriptParser::readPhdrs() {
expect("{");
-
- while (!errorCount() && !consume("}")) {
+ while (auto tok = till("}")) {
PhdrsCommand cmd;
- cmd.name = next();
+ cmd.name = tok;
cmd.type = readPhdrType();
while (!errorCount() && !consume(";")) {
@@ -539,9 +533,9 @@ void ScriptParser::readPhdrs() {
void ScriptParser::readRegionAlias() {
expect("(");
- StringRef alias = unquote(next());
+ StringRef alias = readName();
expect(",");
- StringRef name = next();
+ StringRef name = readName();
expect(")");
if (script->memoryRegions.count(alias))
@@ -553,9 +547,9 @@ void ScriptParser::readRegionAlias() {
void ScriptParser::readSearchDir() {
expect("(");
- StringRef tok = next();
+ StringRef name = readName();
if (!config->nostdlib)
- config->searchPaths.push_back(unquote(tok));
+ config->searchPaths.push_back(name);
expect(")");
}
@@ -613,15 +607,14 @@ SmallVector<SectionCommand *, 0> ScriptParser::readOverlay() {
void ScriptParser::readOverwriteSections() {
expect("{");
- while (!errorCount() && !consume("}"))
- script->overwriteSections.push_back(readOutputSectionDescription(next()));
+ while (auto tok = till("}"))
+ script->overwriteSections.push_back(readOutputSectionDescription(tok));
}
void ScriptParser::readSections() {
expect("{");
SmallVector<SectionCommand *, 0> v;
- while (!errorCount() && !consume("}")) {
- StringRef tok = next();
+ while (auto tok = till("}")) {
if (tok == "OVERLAY") {
for (SectionCommand *cmd : readOverlay())
v.push_back(cmd);
@@ -657,7 +650,7 @@ void ScriptParser::readSections() {
isAfter = true;
else if (!consume("BEFORE"))
setError("expected AFTER/BEFORE, but got '" + next() + "'");
- StringRef where = next();
+ StringRef where = readName();
SmallVector<StringRef, 0> names;
for (SectionCommand *cmd : v)
if (auto *os = dyn_cast<OutputDesc>(cmd))
@@ -672,7 +665,7 @@ void ScriptParser::readTarget() {
// for --format. We recognize only /^elf/ and "binary" in the linker
// script as well.
expect("(");
- StringRef tok = unquote(next());
+ StringRef tok = readName();
expect(")");
if (tok.starts_with("elf"))
@@ -701,9 +694,8 @@ static int precedence(StringRef op) {
StringMatcher ScriptParser::readFilePatterns() {
StringMatcher Matcher;
-
- while (!errorCount() && !consume(")"))
- Matcher.addPattern(SingleStringMatcher(next()));
+ while (auto tok = till(")"))
+ Matcher.addPattern(SingleStringMatcher(tok));
return Matcher;
}
@@ -759,7 +751,7 @@ SmallVector<SectionPattern, 0> ScriptParser::readInputSectionsList() {
setError("section pattern is expected");
break;
}
- SectionMatcher.addPattern(unquote(next()));
+ SectionMatcher.addPattern(readName());
}
if (!SectionMatcher.empty())
@@ -790,7 +782,7 @@ ScriptParser::readInputSectionRules(StringRef filePattern, uint64_t withFlags,
make<InputSectionDescription>(filePattern, withFlags, withoutFlags);
expect("(");
- while (!errorCount() && !consume(")")) {
+ while (peek() != ")" && !atEOF()) {
SortSectionPolicy outer = readSortKind();
SortSectionPolicy inner = SortSectionPolicy::Default;
SmallVector<SectionPattern, 0> v;
@@ -816,6 +808,7 @@ ScriptParser::readInputSectionRules(StringRef filePattern, uint64_t withFlags,
std::move(v.begin(), v.end(), std::back_inserter(cmd->sectionPatterns));
}
+ expect(")");
return cmd;
}
@@ -852,7 +845,7 @@ Expr ScriptParser::readAssert() {
expect("(");
Expr e = readExpr();
expect(",");
- StringRef msg = unquote(next());
+ StringRef msg = readName();
expect(")");
return [=] {
@@ -947,16 +940,19 @@ static Expr checkAlignment(Expr e, std::string &loc) {
}
OutputDesc *ScriptParser::readOverlaySectionDescription() {
- OutputDesc *osd = script->createOutputSection(next(), getCurrentLocation());
+ OutputDesc *osd =
+ script->createOutputSection(readName(), getCurrentLocation());
osd->osec.inOverlay = true;
expect("{");
- while (!errorCount() && !consume("}")) {
+ while (auto tok = till("}")) {
uint64_t withFlags = 0;
uint64_t withoutFlags = 0;
- if (consume("INPUT_SECTION_FLAGS"))
+ if (tok == "INPUT_SECTION_FLAGS") {
std::tie(withFlags, withoutFlags) = readInputSectionFlags();
+ tok = till("");
+ }
osd->osec.commands.push_back(
- readInputSectionRules(next(), withFlags, withoutFlags));
+ readInputSectionRules(tok, withFlags, withoutFlags));
}
osd->osec.phdrs = readOutputSectionPhdrs();
return osd;
@@ -990,8 +986,7 @@ OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) {
osec->constraint = ConstraintKind::ReadWrite;
expect("{");
- while (!errorCount() && !consume("}")) {
- StringRef tok = next();
+ while (auto tok = till("}")) {
if (tok == ";") {
// Empty commands are allowed. Do nothing here.
} else if (SymbolAssignment *assign = readAssignment(tok)) {
@@ -1031,11 +1026,11 @@ OutputDesc *ScriptParser::readOutputSectionDescription(StringRef outSec) {
}
if (consume(">"))
- osec->memoryRegionName = std::string(next());
+ osec->memoryRegionName = std::string(readName());
if (consume("AT")) {
expect(">");
- osec->lmaRegionName = std::string(next());
+ osec->lmaRegionName = std::string(readName());
}
if (osec->lmaExpr && !osec->lmaRegionName.empty())
@@ -1081,10 +1076,10 @@ std::array<uint8_t, 4> ScriptParser::readFill() {
SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) {
expect("(");
- StringRef name = next(), eq = peek();
+ StringRef name = readName(), eq = peek();
if (eq != "=") {
setError("= expected, but got " + next());
- while (!atEOF() && next() != ")")
+ while (till(")"))
;
return nullptr;
}
@@ -1098,45 +1093,55 @@ SymbolAssignment *ScriptParser::readProvideHidden(bool provide, bool hidden) {
return cmd;
}
+// Replace whitespace sequence (including \n) with one single space. The output
+// is used by -Map.
+static void squeezeSpaces(std::string &str) {
+ char prev = '\0';
+ auto it = str.begin();
+ for (char c : str)
+ if (!isSpace(c) || (c = ' ') != prev)
+ *it++ = prev = c;
+ str.erase(it, str.end());
+}
+
SymbolAssignment *ScriptParser::readAssignment(StringRef tok) {
// Assert expression returns Dot, so this is equal to ".=."
if (tok == "ASSERT")
return make<SymbolAssignment>(".", readAssert(), 0, getCurrentLocation());
- size_t oldPos = pos;
+ const char *oldS = prevTok.data();
SymbolAssignment *cmd = nullptr;
bool savedSeenRelroEnd = script->seenRelroEnd;
const StringRef op = peek();
- if (op.starts_with("=")) {
- // Support = followed by an expression without whitespace.
- SaveAndRestore saved(inExpr, true);
- cmd = readSymbolAssignment(tok);
- } else if ((op.size() == 2 && op[1] == '=' && strchr("*/+-&^|", op[0])) ||
- op == "<<=" || op == ">>=") {
- cmd = readSymbolAssignment(tok);
- } else if (tok == "PROVIDE") {
- SaveAndRestore saved(inExpr, true);
- cmd = readProvideHidden(true, false);
- } else if (tok == "HIDDEN") {
+ {
SaveAndRestore saved(inExpr, true);
- cmd = readProvideHidden(false, true);
- } else if (tok == "PROVIDE_HIDDEN") {
- SaveAndRestore saved(inExpr, true);
- cmd = readProvideHidden(true, true);
+ if (op.starts_with("=")) {
+ // Support = followed by an expression without whitespace.
+ cmd = readSymbolAssignment(unquote(tok));
+ } else if ((op.size() == 2 && op[1] == '=' && strchr("+-*/&^|", op[0])) ||
+ op == "<<=" || op == ">>=") {
+ cmd = readSymbolAssignment(unquote(tok));
+ } else if (tok == "PROVIDE") {
+ cmd = readProvideHidden(true, false);
+ } else if (tok == "HIDDEN") {
+ cmd = readProvideHidden(false, true);
+ } else if (tok == "PROVIDE_HIDDEN") {
+ cmd = readProvideHidden(true, true);
+ }
}
if (cmd) {
cmd->dataSegmentRelroEnd = !savedSeenRelroEnd && script->seenRelroEnd;
- cmd->commandString =
- tok.str() + " " +
- llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
+ cmd->commandString = StringRef(oldS, curTok.data() - oldS).str();
+ squeezeSpaces(cmd->commandString);
expect(";");
}
return cmd;
}
+StringRef ScriptParser::readName() { return unquote(next()); }
+
SymbolAssignment *ScriptParser::readSymbolAssignment(StringRef name) {
- name = unquote(name);
StringRef op = next();
assert(op == "=" || op == "*=" || op == "/=" || op == "+=" || op == "-=" ||
op == "&=" || op == "^=" || op == "|=" || op == "<<=" || op == ">>=");
@@ -1281,7 +1286,7 @@ Expr ScriptParser::getPageSize() {
}
Expr ScriptParser::readConstant() {
- StringRef s = readParenLiteral();
+ StringRef s = readParenName();
if (s == "COMMONPAGESIZE")
return getPageSize();
if (s == "MAXPAGESIZE")
@@ -1333,12 +1338,11 @@ ByteCommand *ScriptParser::readByteCommand(StringRef tok) {
if (size == -1)
return nullptr;
- size_t oldPos = pos;
+ const char *oldS = prevTok.data();
Expr e = readParenExpr();
- std::string commandString =
- tok.str() + " " +
- llvm::join(tokens.begin() + oldPos, tokens.begin() + pos, " ");
- return make<ByteCommand>(e, size, commandString);
+ std::string commandString = StringRef(oldS, curBuf.s.data() - oldS).str();
+ squeezeSpaces(commandString);
+ return make<ByteCommand>(e, size, std::move(commandString));
}
static std::optional<uint64_t> parseFlag(StringRef tok) {
@@ -1374,11 +1378,11 @@ static std::optional<uint64_t> parseFlag(StringRef tok) {
// Example: SHF_EXECINSTR & !SHF_WRITE means with flag SHF_EXECINSTR and
// without flag SHF_WRITE.
std::pair<uint64_t, uint64_t> ScriptParser::readInputSectionFlags() {
- uint64_t withFlags = 0;
- uint64_t withoutFlags = 0;
- expect("(");
- while (!errorCount()) {
- StringRef tok = unquote(next());
+ uint64_t withFlags = 0;
+ uint64_t withoutFlags = 0;
+ expect("(");
+ while (!errorCount()) {
+ StringRef tok = readName();
bool without = tok.consume_front("!");
if (std::optional<uint64_t> flag = parseFlag(tok)) {
if (without)
@@ -1398,11 +1402,11 @@ std::pair<uint64_t, uint64_t> ScriptParser::readInputSectionFlags() {
return std::make_pair(withFlags, withoutFlags);
}
-StringRef ScriptParser::readParenLiteral() {
+StringRef ScriptParser::readParenName() {
expect("(");
bool orig = inExpr;
inExpr = false;
- StringRef tok = next();
+ StringRef tok = readName();
inExpr = orig;
expect(")");
return tok;
@@ -1451,7 +1455,7 @@ Expr ScriptParser::readPrimary() {
};
}
if (tok == "ADDR") {
- StringRef name = unquote(readParenLiteral());
+ StringRef name = readParenName();
OutputSection *osec = &script->getOrCreateOutputSection(name)->osec;
osec->usedInExpression = true;
return [=]() -> ExprValue {
@@ -1476,7 +1480,7 @@ Expr ScriptParser::readPrimary() {
};
}
if (tok == "ALIGNOF") {
- StringRef name = unquote(readParenLiteral());
+ StringRef name = readParenName();
OutputSection *osec = &script->getOrCreateOutputSection(name)->osec;
return [=] {
checkIfExists(*osec, location);
@@ -1518,7 +1522,7 @@ Expr ScriptParser::readPrimary() {
return [=] { return alignToPowerOf2(script->getDot(), config->maxPageSize); };
}
if (tok == "DEFINED") {
- StringRef name = unquote(readParenLiteral());
+ StringRef name = readParenName();
// Return 1 if s is defined. If the definition is only found in a linker
// script, it must happen before this DEFINED.
auto order = ctx.scriptSymOrderCounter++;
@@ -1529,7 +1533,7 @@ Expr ScriptParser::readPrimary() {
};
}
if (tok == "LENGTH") {
- StringRef name = readParenLiteral();
+ StringRef name = readParenName();
if (script->memoryRegions.count(name) == 0) {
setError("memory region not defined: " + name);
return [] { return 0; };
@@ -1537,7 +1541,7 @@ Expr ScriptParser::readPrimary() {
return script->memoryRegions[name]->length;
}
if (tok == "LOADADDR") {
- StringRef name = unquote(readParenLiteral());
+ StringRef name = readParenName();
OutputSection *osec = &script->getOrCreateOutputSection(name)->osec;
osec->usedInExpression = true;
return [=] {
@@ -1565,7 +1569,7 @@ Expr ScriptParser::readPrimary() {
return [=] { return std::max(a().getValue(), b().getValue()); };
}
if (tok == "ORIGIN") {
- StringRef name = readParenLiteral();
+ StringRef name = readParenName();
if (script->memoryRegions.count(name) == 0) {
setError("memory region not defined: " + name);
return [] { return 0; };
@@ -1581,7 +1585,7 @@ Expr ScriptParser::readPrimary() {
return [=] { return e(); };
}
if (tok == "SIZEOF") {
- StringRef name = unquote(readParenLiteral());
+ StringRef name = readParenName();
OutputSection *cmd = &script->getOrCreateOutputSection(name)->osec;
// Linker script does not create an output section if its content is empty.
// We want to allow SIZEOF(.foo) where .foo is a section which happened to
@@ -1629,7 +1633,7 @@ SmallVector<StringRef, 0> ScriptParser::readOutputSectionPhdrs() {
SmallVector<StringRef, 0> phdrs;
while (!errorCount() && peek().starts_with(":")) {
StringRef tok = next();
- phdrs.push_back((tok.size() == 1) ? next() : tok.substr(1));
+ phdrs.push_back((tok.size() == 1) ? readName() : tok.substr(1));
}
return phdrs;
}
@@ -1716,15 +1720,11 @@ ScriptParser::readSymbols() {
SmallVector<SymbolVersion, 0> globals;
SmallVector<SymbolVersion, 0> *v = &globals;
- while (!errorCount()) {
- if (consume("}"))
- break;
-
- if (consume("extern")) {
+ while (auto tok = till("}")) {
+ if (tok == "extern") {
SmallVector<SymbolVersion, 0> ext = readVersionExtern();
v->insert(v->end(), ext.begin(), ext.end());
} else {
- StringRef tok = next();
if (tok == "local:" || (tok == "local" && consume(":"))) {
v = &locals;
continue;
@@ -1753,16 +1753,13 @@ SmallVector<SymbolVersion, 0> ScriptParser::readVersionExtern() {
expect("{");
SmallVector<SymbolVersion, 0> ret;
- while (!errorCount() && peek() != "}") {
- StringRef tok = next();
+ while (auto tok = till("}")) {
ret.push_back(
- {unquote(tok), isCXX, !tok.starts_with("\"") && hasWildcard(tok)});
+ {unquote(tok), isCXX, !tok.str.starts_with("\"") && hasWildcard(tok)});
if (consume("}"))
return ret;
expect(";");
}
-
- expect("}");
return ret;
}
@@ -1782,8 +1779,7 @@ Expr ScriptParser::readMemoryAssignment(StringRef s1, StringRef s2,
// MEMORY { name [(attr)] : ORIGIN = origin, LENGTH = len ... }
void ScriptParser::readMemory() {
expect("{");
- while (!errorCount() && !consume("}")) {
- StringRef tok = next();
+ while (auto tok = till("}")) {
if (tok == "INCLUDE") {
readInclude();
continue;
@@ -1861,7 +1857,4 @@ void elf::readDynamicList(MemoryBufferRef mb) {
ScriptParser(mb).readDynamicList();
}
-void elf::readDefsym(StringRef name, MemoryBufferRef mb) {
- llvm::TimeTraceScope timeScope("Read defsym input", name);
- ScriptParser(mb).readDefsym(name);
-}
+void elf::readDefsym(MemoryBufferRef mb) { ScriptParser(mb).readDefsym(); }
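
Most of the parser loops above now share one shape: till(end) returns the next token, or an empty Token at the closing token or at EOF, so "read until ')' or '}'" becomes a single while condition instead of a consume/next pair plus an errorCount check. A small standalone illustration of the idiom, with stand-in types rather than the lld ones:

    #include <string>
    #include <vector>

    struct Token {
      std::string str;
      explicit operator bool() const { return !str.empty(); }
    };

    // Returns {} once `end` (or the end of input) is reached, else the next token.
    Token till(const std::vector<std::string> &toks, size_t &pos,
               const std::string &end) {
      if (pos == toks.size())
        return {};          // EOF; the real lld version also reports an error here
      if (toks[pos] == end) {
        ++pos;              // consume the closing token
        return {};
      }
      return {toks[pos++]};
    }

    // Usage: collect everything up to the closing ")".
    std::vector<std::string> readList(const std::vector<std::string> &toks,
                                      size_t &pos) {
      std::vector<std::string> out;
      while (Token tok = till(toks, pos, ")"))
        out.push_back(tok.str);
      return out;
    }
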
diff --git a/lld/ELF/ScriptParser.h b/lld/ELF/ScriptParser.h
index 34b27d2..d6f71c5 100644
--- a/lld/ELF/ScriptParser.h
+++ b/lld/ELF/ScriptParser.h
@@ -24,7 +24,7 @@ void readVersionScript(MemoryBufferRef mb);
void readDynamicList(MemoryBufferRef mb);
// Parses the defsym expression.
-void readDefsym(StringRef name, MemoryBufferRef mb);
+void readDefsym(MemoryBufferRef mb);
bool hasWildcard(StringRef s);
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index 93653de..263d4f3 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -73,7 +73,6 @@ Defined *ElfSym::riscvGlobalPointer;
Defined *ElfSym::relaIpltStart;
Defined *ElfSym::relaIpltEnd;
Defined *ElfSym::tlsModuleBase;
-SmallVector<SymbolAux, 0> elf::symAux;
static uint64_t getSymVA(const Symbol &sym, int64_t addend) {
switch (sym.kind()) {
diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index e764fe8..ff82561 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -56,17 +56,6 @@ enum {
NEEDS_TLSIE = 1 << 8,
};
-// Some index properties of a symbol are stored separately in this auxiliary
-// struct to decrease sizeof(SymbolUnion) in the majority of cases.
-struct SymbolAux {
- uint32_t gotIdx = -1;
- uint32_t pltIdx = -1;
- uint32_t tlsDescIdx = -1;
- uint32_t tlsGdIdx = -1;
-};
-
-LLVM_LIBRARY_VISIBILITY extern SmallVector<SymbolAux, 0> symAux;
-
// The base class for real symbol classes.
class Symbol {
public:
@@ -211,10 +200,10 @@ public:
// truncated by Symbol::parseSymbolVersion().
const char *getVersionSuffix() const { return nameData + nameSize; }
- uint32_t getGotIdx() const { return symAux[auxIdx].gotIdx; }
- uint32_t getPltIdx() const { return symAux[auxIdx].pltIdx; }
- uint32_t getTlsDescIdx() const { return symAux[auxIdx].tlsDescIdx; }
- uint32_t getTlsGdIdx() const { return symAux[auxIdx].tlsGdIdx; }
+ uint32_t getGotIdx() const { return ctx.symAux[auxIdx].gotIdx; }
+ uint32_t getPltIdx() const { return ctx.symAux[auxIdx].pltIdx; }
+ uint32_t getTlsDescIdx() const { return ctx.symAux[auxIdx].tlsDescIdx; }
+ uint32_t getTlsGdIdx() const { return ctx.symAux[auxIdx].tlsGdIdx; }
bool isInGot() const { return getGotIdx() != uint32_t(-1); }
bool isInPlt() const { return getPltIdx() != uint32_t(-1); }
@@ -325,8 +314,8 @@ public:
// entries during postScanRelocations();
std::atomic<uint16_t> flags;
- // A symAux index used to access GOT/PLT entry indexes. This is allocated in
- // postScanRelocations().
+ // A ctx.symAux index used to access GOT/PLT entry indexes. This is allocated
+ // in postScanRelocations().
uint32_t auxIdx;
uint32_t dynsymIndex;
@@ -357,8 +346,8 @@ public:
}
void allocateAux() {
assert(auxIdx == 0);
- auxIdx = symAux.size();
- symAux.emplace_back();
+ auxIdx = ctx.symAux.size();
+ ctx.symAux.emplace_back();
}
bool isSection() const { return type == llvm::ELF::STT_SECTION; }
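
The Symbols.h/Symbols.cpp hunks retire the file-scope symAux vector; the SymbolAux records now hang off the context object, and every accessor goes through ctx.symAux[auxIdx]. A compact standalone sketch of the resulting scheme (the Ctx type here is only an illustration of per-link state, not lld's real context class):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct SymbolAux {            // same fields as the struct removed above
      uint32_t gotIdx = -1;
      uint32_t pltIdx = -1;
      uint32_t tlsDescIdx = -1;
      uint32_t tlsGdIdx = -1;
    };

    struct Ctx {
      std::vector<SymbolAux> symAux;
    } ctx;                        // stands in for lld's ctx object

    struct Symbol {
      uint32_t auxIdx = 0;
      void allocateAux() {
        assert(auxIdx == 0);
        auxIdx = ctx.symAux.size();
        ctx.symAux.emplace_back();
      }
      uint32_t getPltIdx() const { return ctx.symAux[auxIdx].pltIdx; }
      bool isInPlt() const { return getPltIdx() != uint32_t(-1); }
    };
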
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 5d3f3df..d0b1933 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -653,20 +653,20 @@ GotSection::GotSection()
void GotSection::addConstant(const Relocation &r) { relocations.push_back(r); }
void GotSection::addEntry(const Symbol &sym) {
- assert(sym.auxIdx == symAux.size() - 1);
- symAux.back().gotIdx = numEntries++;
+ assert(sym.auxIdx == ctx.symAux.size() - 1);
+ ctx.symAux.back().gotIdx = numEntries++;
}
bool GotSection::addTlsDescEntry(const Symbol &sym) {
- assert(sym.auxIdx == symAux.size() - 1);
- symAux.back().tlsDescIdx = numEntries;
+ assert(sym.auxIdx == ctx.symAux.size() - 1);
+ ctx.symAux.back().tlsDescIdx = numEntries;
numEntries += 2;
return true;
}
bool GotSection::addDynTlsEntry(const Symbol &sym) {
- assert(sym.auxIdx == symAux.size() - 1);
- symAux.back().tlsGdIdx = numEntries;
+ assert(sym.auxIdx == ctx.symAux.size() - 1);
+ ctx.symAux.back().tlsGdIdx = numEntries;
// Global Dynamic TLS entries take two GOT slots.
numEntries += 2;
return true;
@@ -997,12 +997,12 @@ void MipsGotSection::build() {
for (auto &p : primGot->global) {
if (p.first->auxIdx == 0)
p.first->allocateAux();
- symAux.back().gotIdx = p.second;
+ ctx.symAux.back().gotIdx = p.second;
}
for (auto &p : primGot->relocs) {
if (p.first->auxIdx == 0)
p.first->allocateAux();
- symAux.back().gotIdx = p.second;
+ ctx.symAux.back().gotIdx = p.second;
}
// Create dynamic relocations.
@@ -1171,8 +1171,8 @@ GotPltSection::GotPltSection()
}
void GotPltSection::addEntry(Symbol &sym) {
- assert(sym.auxIdx == symAux.size() - 1 &&
- symAux.back().pltIdx == entries.size());
+ assert(sym.auxIdx == ctx.symAux.size() - 1 &&
+ ctx.symAux.back().pltIdx == entries.size());
entries.push_back(&sym);
}
@@ -1217,7 +1217,7 @@ IgotPltSection::IgotPltSection()
target->gotEntrySize, getIgotPltName()) {}
void IgotPltSection::addEntry(Symbol &sym) {
- assert(symAux.back().pltIdx == entries.size());
+ assert(ctx.symAux.back().pltIdx == entries.size());
entries.push_back(&sym);
}
@@ -2566,8 +2566,8 @@ void PltSection::writeTo(uint8_t *buf) {
}
void PltSection::addEntry(Symbol &sym) {
- assert(sym.auxIdx == symAux.size() - 1);
- symAux.back().pltIdx = entries.size();
+ assert(sym.auxIdx == ctx.symAux.size() - 1);
+ ctx.symAux.back().pltIdx = entries.size();
entries.push_back(&sym);
}
@@ -2613,8 +2613,8 @@ size_t IpltSection::getSize() const {
}
void IpltSection::addEntry(Symbol &sym) {
- assert(sym.auxIdx == symAux.size() - 1);
- symAux.back().pltIdx = entries.size();
+ assert(sym.auxIdx == ctx.symAux.size() - 1);
+ ctx.symAux.back().pltIdx = entries.size();
entries.push_back(&sym);
}
@@ -3203,10 +3203,10 @@ template <class ELFT> DebugNamesSection<ELFT>::DebugNamesSection() {
template <class ELFT>
template <class RelTy>
void DebugNamesSection<ELFT>::getNameRelocs(
- InputSection *sec, ArrayRef<RelTy> rels,
- DenseMap<uint32_t, uint32_t> &relocs) {
+ const InputFile &file, DenseMap<uint32_t, uint32_t> &relocs,
+ Relocs<RelTy> rels) {
for (const RelTy &rel : rels) {
- Symbol &sym = sec->file->getRelocTargetSym(rel);
+ Symbol &sym = file.getRelocTargetSym(rel);
relocs[rel.r_offset] = sym.getVA(getAddend<ELFT>(rel));
}
}
@@ -3216,11 +3216,7 @@ template <class ELFT> void DebugNamesSection<ELFT>::finalizeContents() {
auto relocs = std::make_unique<DenseMap<uint32_t, uint32_t>[]>(numChunks);
parallelFor(0, numChunks, [&](size_t i) {
InputSection *sec = inputSections[i];
- auto rels = sec->template relsOrRelas<ELFT>();
- if (rels.areRelocsRel())
- getNameRelocs(sec, rels.rels, relocs.get()[i]);
- else
- getNameRelocs(sec, rels.relas, relocs.get()[i]);
+ invokeOnRelocs(*sec, getNameRelocs, *sec->file, relocs.get()[i]);
// Relocate CU offsets with .debug_info + X relocations.
OutputChunk &chunk = chunks.get()[i];
@@ -4669,7 +4665,8 @@ template <class ELFT> void elf::createSyntheticSections() {
auto add = [](SyntheticSection &sec) { ctx.inputSections.push_back(&sec); };
- in.shStrTab = std::make_unique<StringTableSection>(".shstrtab", false);
+ if (config->zSectionHeader)
+ in.shStrTab = std::make_unique<StringTableSection>(".shstrtab", false);
Out::programHeaders = make<OutputSection>("", 0, SHF_ALLOC);
Out::programHeaders->addralign = config->wordsize;
@@ -4921,7 +4918,8 @@ template <class ELFT> void elf::createSyntheticSections() {
add(*in.symTab);
if (in.symTabShndx)
add(*in.symTabShndx);
- add(*in.shStrTab);
+ if (in.shStrTab)
+ add(*in.shStrTab);
if (in.strTab)
add(*in.strTab);
}
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index eaa09ea..d4169e1 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -916,8 +916,9 @@ public:
void writeTo(uint8_t *buf) override;
template <class RelTy>
- void getNameRelocs(InputSection *sec, ArrayRef<RelTy> rels,
- llvm::DenseMap<uint32_t, uint32_t> &relocs);
+ void getNameRelocs(const InputFile &file,
+ llvm::DenseMap<uint32_t, uint32_t> &relocs,
+ Relocs<RelTy> rels);
private:
static void readOffsets(InputChunk &inputChunk, OutputChunk &chunk,
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 5cffdb7..515ebb7 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1875,13 +1875,16 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
sortSections();
// Create a list of OutputSections, assign sectionIndex, and populate
- // in.shStrTab.
+ // in.shStrTab. If -z nosectionheader is specified, drop non-ALLOC sections.
for (SectionCommand *cmd : script->sectionCommands)
if (auto *osd = dyn_cast<OutputDesc>(cmd)) {
OutputSection *osec = &osd->osec;
+ if (!in.shStrTab && !(osec->flags & SHF_ALLOC))
+ continue;
outputSections.push_back(osec);
osec->sectionIndex = outputSections.size();
- osec->shName = in.shStrTab->addString(osec->name);
+ if (in.shStrTab)
+ osec->shName = in.shStrTab->addString(osec->name);
}
// Prefer command line supplied address over other constraints.
@@ -2703,6 +2706,10 @@ template <class ELFT> void Writer<ELFT>::writeHeader() {
auto *eHdr = reinterpret_cast<Elf_Ehdr *>(Out::bufferStart);
eHdr->e_type = getELFType();
eHdr->e_entry = getEntryAddr();
+
+ // If -z nosectionheader is specified, omit the section header table.
+ if (!in.shStrTab)
+ return;
eHdr->e_shoff = sectionHeaderOff;
// Write the section header table.
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 6f60efd..e9d3c12 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -26,6 +26,10 @@ Non-comprehensive list of changes in this release
ELF Improvements
----------------
+* ``-z nosectionheader`` has been implemented to omit the section header table.
+ The operation is similar to ``llvm-objcopy --strip-sections``.
+ (`#101286 <https://github.com/llvm/llvm-project/pull/101286>`_)
+
Breaking changes
----------------
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index f9a00b7..b22cb36 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -857,6 +857,9 @@ The object will omit the
.Dv PT_GNU_RELRO
segment.
.Pp
+.It Cm nosectionheader
+Don't generate the section header table.
+.Pp
.It Cm notext
Allow relocations against read-only segments.
Sets the
diff --git a/lld/test/CMakeLists.txt b/lld/test/CMakeLists.txt
index 25d8f0a..5d4a275 100644
--- a/lld/test/CMakeLists.txt
+++ b/lld/test/CMakeLists.txt
@@ -64,6 +64,7 @@ if (NOT LLD_BUILT_STANDALONE)
llvm-profdata
llvm-readelf
llvm-readobj
+ llvm-strings
llvm-strip
llvm-symbolizer
not
diff --git a/lld/test/ELF/defsym.s b/lld/test/ELF/defsym.s
index fed937f..eb409cc 100644
--- a/lld/test/ELF/defsym.s
+++ b/lld/test/ELF/defsym.s
@@ -5,14 +5,16 @@
# RUN: llvm-objdump -d --print-imm-hex %t | FileCheck %s --check-prefix=USE
## Check that we accept --defsym foo2=foo1 form.
-# RUN: ld.lld -o %t2 %t.o --defsym foo2=foo1
+# RUN: ld.lld -o %t2 %t.o --defsym '"foo2"=foo1'
# RUN: llvm-readelf -s %t2 | FileCheck %s
# RUN: llvm-objdump -d --print-imm-hex %t2 | FileCheck %s --check-prefix=USE
## Check we are reporting the error correctly and don't crash
## when handling the second --defsym.
-# RUN: not ld.lld -o /dev/null %t.o --defsym ERR+ --defsym foo2=foo1 2>&1 | FileCheck %s --check-prefix=ERR
-# ERR: error: --defsym: syntax error: ERR+
+# RUN: not ld.lld -o /dev/null %t.o --defsym ERR+ --defsym foo2=foo1 2>&1 | FileCheck %s --check-prefix=ERR --strict-whitespace
+# ERR:error: --defsym:1: = expected, but got +
+# ERR-NEXT:>>> ERR+
+# ERR-NEXT:>>> ^
# CHECK-DAG: 0000000000000123 0 NOTYPE GLOBAL DEFAULT ABS foo1
# CHECK-DAG: 0000000000000123 0 NOTYPE GLOBAL DEFAULT ABS foo2
@@ -41,10 +43,9 @@
# ERR2: error: --defsym:1: EOF expected, but got ,
# RUN: not ld.lld -o /dev/null %t.o --defsym=foo 2>&1 | FileCheck %s -check-prefix=ERR3
-# ERR3: error: --defsym: syntax error: foo
+# ERR3: error: --defsym:1: unexpected EOF
-# RUN: not ld.lld -o /dev/null %t.o --defsym= 2>&1 | FileCheck %s -check-prefix=ERR4
-# ERR4: error: --defsym: syntax error:
+# RUN: not ld.lld -o /dev/null %t.o --defsym= 2>&1 | FileCheck %s -check-prefix=ERR3
.globl foo1
foo1 = 0x123
diff --git a/lld/test/ELF/gc-sections-with-provide.s b/lld/test/ELF/gc-sections-with-provide.s
index 3e5b1b1..268faa4 100644
--- a/lld/test/ELF/gc-sections-with-provide.s
+++ b/lld/test/ELF/gc-sections-with-provide.s
@@ -53,8 +53,8 @@ baz:
#--- script.t
-PROVIDE(unused = bar + used);
-PROVIDE(used = another_used);
+PROVIDE(unused = bar + "used");
+PROVIDE("used" = another_used);
PROVIDE(baz_ref = baz);
PROVIDE(another_used = baz_ref);
PROVIDE(another_unused = unused + bar + 0x1);
diff --git a/lld/test/ELF/linkerscript/at2.test b/lld/test/ELF/linkerscript/at2.test
index d744fce..24c4935 100644
--- a/lld/test/ELF/linkerscript/at2.test
+++ b/lld/test/ELF/linkerscript/at2.test
@@ -19,7 +19,7 @@ SECTIONS {
.foo2 : { *(.foo2) } > AX
.bar1 : { *(.bar1) } > AW
- .bar2 : { *(.bar2) } > AW AT > RAM
+ .bar2 : { *(.bar2) } > "AW" AT > "RAM"
.bar3 . : { *(.bar3) } > AW
.bar4 : { *(.bar4) } > AW AT >RAM
}
diff --git a/lld/test/ELF/linkerscript/at3.test b/lld/test/ELF/linkerscript/at3.test
index 1e7f970..4d6d753 100644
--- a/lld/test/ELF/linkerscript/at3.test
+++ b/lld/test/ELF/linkerscript/at3.test
@@ -13,7 +13,7 @@ MEMORY {
SECTIONS {
.foo1 : { *(.foo1) } > FOO AT>FLASH
.foo2 : { *(.foo2) BYTE(0x42) } > BAR AT>FLASH
- .foo3 : { *(.foo3) } > ZED AT>FLASH
+ .foo3 : { *(.foo3) } >"ZED" AT>"FLASH"
}
# CHECK: .foo1 PROGBITS 0000000000001000 001000
diff --git a/lld/test/ELF/linkerscript/group.s b/lld/test/ELF/linkerscript/group.s
index 89b09e5..8cfe163 100644
--- a/lld/test/ELF/linkerscript/group.s
+++ b/lld/test/ELF/linkerscript/group.s
@@ -3,62 +3,61 @@
# RUN: rm -rf %t.dir && mkdir %t.dir
# RUN: rm -rf %t && split-file %s %t && cd %t
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/libsearch-st.s -o b.o
-# RUN: llvm-ar rc %t.dir/libxyz.a b.o
-
-# RUN: echo 'GROUP("a.o")' > %t.t
-# RUN: ld.lld -o %t2 %t.t
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'INPUT("a.o")' > %t.t
-# RUN: ld.lld -o %t2 %t.t
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" libxyz.a )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null
-# RUN: ld.lld -o %t2 %t.t -L%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" =libxyz.a )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null
-# RUN: ld.lld -o %t2 %t.t --sysroot=%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" -lxyz )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null
-# RUN: ld.lld -o %t2 %t.t -L%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" libxyz.a )' > %t.t
-# RUN: not ld.lld -o /dev/null %t.t 2>/dev/null
-# RUN: ld.lld -o %t2 %t.t -L%t.dir
-# RUN: llvm-readobj %t2 > /dev/null
-
-# RUN: echo 'GROUP("a.o" /libxyz.a )' > %t.t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/libsearch-st.s -o xyz.o
+# RUN: llvm-ar rc %t.dir/libb.a b.o
+# RUN: llvm-ar rc %t.dir/libxyz.a xyz.o
+
+# RUN: echo 'GROUP("a.o" libxyz.a -lxyz b.o )' > 1.t
+# RUN: not ld.lld 1.t 2>&1 | FileCheck %s --check-prefix=NOLIB
+# RUN: ld.lld 1.t -L%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+
+# RUN: echo 'GROUP( "a.o" b.o =libxyz.a )' > 2.t
+# RUN: not ld.lld 2.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=libxyz.a
+# RUN: ld.lld 2.t --sysroot=%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+
+# RUN: echo 'GROUP("%t.dir/3a.t")' > 3.t
+# RUN: echo 'INCLUDE "%t.dir/3a.t"' > 3i.t
+# RUN: echo 'GROUP(AS_NEEDED("a.o"))INPUT(/libb.a)' > %t.dir/3a.t
+# RUN: ld.lld 3.t --sysroot=%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+# RUN: ld.lld 3i.t --sysroot=%t.dir
+# RUN: llvm-nm a.out | FileCheck %s
+
+# RUN: echo 'GROUP("%t.dir/4a.t")INPUT(/libb.a)' > 4.t
+# RUN: echo 'GROUP(AS_NEEDED("a.o"))' > %t.dir/4a.t
+# RUN: not ld.lld 4.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libb.a
+
+# RUN: echo 'INCLUDE "%t.dir/5a.t" INPUT(/libb.a)' > 5.t
+# RUN: echo 'GROUP(a.o)' > %t.dir/5a.t
+# RUN: not ld.lld 5.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libb.a
+
+# CHECK: T _start
+
+# NOLIB: error: {{.*}}unable to find
+
+# RUN: echo 'GROUP("a.o" /libxyz.a )' > a.t
# RUN: echo 'GROUP("%t/a.o" /libxyz.a )' > %t.dir/xyz.t
-# RUN: not ld.lld -o /dev/null %t.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
-# RUN: not ld.lld -o /dev/null %t.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
+# RUN: not ld.lld a.t 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
+# RUN: not ld.lld a.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_OPEN -DFILE=/libxyz.a
## Since %t.dir/%t does not exist, report an error, instead of falling back to %t
## without the sysroot prefix.
-# RUN: not ld.lld -o /dev/null %t.dir/xyz.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_FIND_SYSROOT -DTMP=%t/a.o
+# RUN: not ld.lld %t.dir/xyz.t --sysroot=%t.dir 2>&1 | FileCheck %s --check-prefix=CANNOT_FIND_SYSROOT -DTMP=%t/a.o
# CANNOT_FIND_SYSROOT: error: {{.*}}xyz.t:1: cannot find [[TMP]] inside {{.*}}.dir
# CANNOT_FIND_SYSROOT-NEXT: >>> GROUP({{.*}}
-# RUN: echo 'GROUP("2.t")' > 1.t
-# RUN: echo 'GROUP("a.o")' > 2.t
-# RUN: ld.lld 1.t
-# RUN: llvm-readobj a.out > /dev/null
-
-# RUN: echo 'GROUP(AS_NEEDED("a.o"))' > 1.t
-# RUN: ld.lld 1.t
-# RUN: llvm-readobj a.out > /dev/null
-
# CANNOT_OPEN: error: cannot open [[FILE]]: {{.*}}
#--- a.s
.globl _start
_start:
- ret
+ call b
+
+#--- b.s
+.globl b
+b:
diff --git a/lld/test/ELF/linkerscript/header-phdr.test b/lld/test/ELF/linkerscript/header-phdr.test
deleted file mode 100644
index 866e2d4..0000000
--- a/lld/test/ELF/linkerscript/header-phdr.test
+++ /dev/null
@@ -1,15 +0,0 @@
-# REQUIRES: x86
-# RUN: echo '.section .zed, "a"; .zero 4' \
-# RUN: | llvm-mc -filetype=obj -triple=x86_64-unknown-linux - -o %t.o
-# RUN: ld.lld --script %s %t.o -o %t
-# RUN: llvm-readelf -S -l -W %t | FileCheck %s
-
-# CHECK: [ 1] .abc PROGBITS 0000000000001000 001000 000004 00 A 0 0 1
-# CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x001004 0x001004 R E 0x1000
-
-PHDRS { foobar PT_LOAD FILEHDR PHDRS; }
-
-SECTIONS {
- . = 0x1000;
- .abc : { *(.zed) } : foobar
-}
diff --git a/lld/test/ELF/linkerscript/include-cycle.s b/lld/test/ELF/linkerscript/include-cycle.s
index e93ed90..a87b079 100644
--- a/lld/test/ELF/linkerscript/include-cycle.s
+++ b/lld/test/ELF/linkerscript/include-cycle.s
@@ -1,15 +1,32 @@
# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
-# RUN: echo "INCLUDE \"%t1.script\"" > %t1.script
-# RUN: not ld.lld %t.o %t1.script 2>&1 | FileCheck %s
+# RUN: not ld.lld a.o -T 1.lds 2>&1 | FileCheck %s --check-prefix=ERR1
+# ERR1: error: 1.lds:1: there is a cycle in linker script INCLUDEs
-# RUN: echo "INCLUDE \"%t2.script\"" > %t1.script
-# RUN: echo "INCLUDE \"%t1.script\"" > %t2.script
-# RUN: not ld.lld %t.o %t1.script 2>&1 | FileCheck %s
+# RUN: not ld.lld a.o -T 2a.lds 2>&1 | FileCheck %s --check-prefix=ERR2
+# ERR2: error: 2b.lds:1: there is a cycle in linker script INCLUDEs
-# CHECK: there is a cycle in linker script INCLUDEs
+# RUN: ld.lld a.o -T 3.lds -o 3
+# RUN: llvm-objdump -s 3 | FileCheck %s --check-prefix=CHECK3
+# CHECK3: Contents of section foo:
+# CHECK3-NEXT: 0000 2a2a **
+#--- 0.lds
+BYTE(42)
+#--- 1.lds
+INCLUDE "1.lds"
+#--- 2a.lds
+INCLUDE "2b.lds"
+#--- 2b.lds
+INCLUDE "2a.lds"
+#--- 3.lds
+SECTIONS {
+ foo : { INCLUDE "0.lds" INCLUDE "0.lds" }
+}
+
+#--- a.s
.globl _start
_start:
ret
diff --git a/lld/test/ELF/linkerscript/insert-after.test b/lld/test/ELF/linkerscript/insert-after.test
index 4b25ff3..38eb048 100644
--- a/lld/test/ELF/linkerscript/insert-after.test
+++ b/lld/test/ELF/linkerscript/insert-after.test
@@ -41,7 +41,7 @@
SECTIONS { .byte : { BYTE(0) } } INSERT AFTER .data;
-SECTIONS { .foo.data : { *(.foo.data) } } INSERT AFTER .data;
+SECTIONS { .foo.data : { *(.foo.data) } } INSERT AFTER ".data";
## The input section .foo.text is an orphan. It will be placed in .foo.text
SECTIONS { .foo.text : {} } INSERT AFTER .text;
diff --git a/lld/test/ELF/linkerscript/insert-before.test b/lld/test/ELF/linkerscript/insert-before.test
index a728349..f06c7332 100644
--- a/lld/test/ELF/linkerscript/insert-before.test
+++ b/lld/test/ELF/linkerscript/insert-before.test
@@ -50,4 +50,4 @@ SECTIONS { .byte : { BYTE(0) } } INSERT BEFORE .data;
SECTIONS { .foo.data : { *(.foo.data) } } INSERT BEFORE .data;
## The input section .foo.text is an orphan. It will be placed in .foo.text
-SECTIONS { .foo.text : { x0 = .; } } INSERT BEFORE .text;
+SECTIONS { .foo.text : { x0 = .; } } INSERT BEFORE ".text";
diff --git a/lld/test/ELF/linkerscript/invalid.test b/lld/test/ELF/linkerscript/invalid.test
index 4cbedf6..73b761c 100644
--- a/lld/test/ELF/linkerscript/invalid.test
+++ b/lld/test/ELF/linkerscript/invalid.test
@@ -15,7 +15,7 @@
# RUN: echo foobar > %t1
# RUN: not ld.lld %t1 no-such-file 2>&1 | FileCheck -check-prefix=ERR1 %s
-# ERR1: unexpected EOF
+# ERR1: error: {{.*}}1:1: unknown directive: foobar
# ERR1: cannot open no-such-file:
# RUN: echo "foo \"bar" > %t2
diff --git a/lld/test/ELF/linkerscript/map-file.test b/lld/test/ELF/linkerscript/map-file.test
index 6ec8baf..6347c3a 100644
--- a/lld/test/ELF/linkerscript/map-file.test
+++ b/lld/test/ELF/linkerscript/map-file.test
@@ -7,17 +7,17 @@
# RUN: FileCheck -strict-whitespace %s < %t.map
SECTIONS {
- . = 0x1000;
+ . = 0x1000; # tabs
.foo : {
- BYTE(0x11)
- SHORT(0x1122)
+ BYTE ( 0x11 )
+ SHORT (0x1122)
LONG(0x11223344)
QUAD(0x1122334455667788)
PROVIDE_HIDDEN(sym4 = .);
. += 0x1000;
*(.foo.1)
PROVIDE(unused1 = 0xff);
- HIDDEN(sym6 = .);
+ HIDDEN( sym6 = . );
. += 0x123 *
(1 + 1);
foo = .;
@@ -34,20 +34,20 @@ SECTIONS {
# CHECK-NEXT: 0 0 1000 1 . = 0x1000
# CHECK-NEXT: 1000 1000 125d 1 .foo
# CHECK-NEXT: 1000 1000 1 1 BYTE ( 0x11 )
-# CHECK-NEXT: 1001 1001 2 1 SHORT ( 0x1122 )
-# CHECK-NEXT: 1003 1003 4 1 LONG ( 0x11223344 )
-# CHECK-NEXT: 1007 1007 8 1 QUAD ( 0x1122334455667788 )
-# CHECK-NEXT: 100f 100f 0 1 PROVIDE_HIDDEN ( sym4 = . )
+# CHECK-NEXT: 1001 1001 2 1 SHORT (0x1122)
+# CHECK-NEXT: 1003 1003 4 1 LONG(0x11223344)
+# CHECK-NEXT: 1007 1007 8 1 QUAD(0x1122334455667788)
+# CHECK-NEXT: 100f 100f 0 1 PROVIDE_HIDDEN(sym4 = .)
# CHECK-NEXT: 100f 100f 1000 1 . += 0x1000
# CHECK-NEXT: 200f 200f 8 1 {{.*}}{{/|\\}}map-file.test.tmp.o:(.foo.1)
-# CHECK-NEXT: 2017 2017 0 1 HIDDEN ( sym6 = . )
-# CHECK-NEXT: 2017 2017 246 1 . += 0x123 * ( 1 + 1 )
+# CHECK-NEXT: 2017 2017 0 1 HIDDEN( sym6 = . )
+# CHECK-NEXT: 2017 2017 246 1 . += 0x123 * (1 + 1)
# CHECK-NEXT: 225d 225d 0 1 foo = .
# CHECK-NEXT: 225d 225d 0 1 bar = 0x42 - 0x26
# CHECK-NEXT: 225d 225d 0 1 sym1 = .
# CHECK-NEXT: 225d 225d 500 1 . += 0x500
# CHECK-NEXT: 275d 275d 0 1 sym2 = .
-# CHECK-NEXT: 275d 275d 0 1 PROVIDE ( sym3 = 42 )
+# CHECK-NEXT: 275d 275d 0 1 PROVIDE(sym3 = 42)
# CHECK-NEXT: 2760 2760 10 4 .text
# CHECK-NEXT: 2760 2760 10 4 {{.*}}{{/|\\}}map-file.test.tmp.o:(.text)
# CHECK-NEXT: 0 0 8 1 .comment
diff --git a/lld/test/ELF/linkerscript/map-file2.test b/lld/test/ELF/linkerscript/map-file2.test
index 8efb5d6..a34595f 100644
--- a/lld/test/ELF/linkerscript/map-file2.test
+++ b/lld/test/ELF/linkerscript/map-file2.test
@@ -27,7 +27,7 @@ SECTIONS {
# CHECK-NEXT: 1010 3000 8 1 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.ccc)
# CHECK-NEXT: 1018 3008 100 1 . += 0x100
# CHECK-NEXT: 1118 3108 109 1 .ddd
-# CHECK-NEXT: 1118 3108 1 1 BYTE ( 0x11 )
+# CHECK-NEXT: 1118 3108 1 1 BYTE(0x11)
# CHECK-NEXT: 1119 3109 100 1 . += 0x100
# CHECK-NEXT: 1219 3209 8 1 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.ddd)
# CHECK-NEXT: 1228 3218 34 8 .eh_frame
diff --git a/lld/test/ELF/linkerscript/memory-err.s b/lld/test/ELF/linkerscript/memory-err.s
index 5ec190a..5500b84 100644
--- a/lld/test/ELF/linkerscript/memory-err.s
+++ b/lld/test/ELF/linkerscript/memory-err.s
@@ -71,6 +71,10 @@
# RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck --check-prefix=NOT_CONVERGE %s
# NOT_CONVERGE: error: address (0x14) of section '.text' does not converge
+# RUN: echo 'MEMORY { ram : ORIGIN = symbol, LENGTH = 4094 ' > %t.script
+# RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck --check-prefix=UNCLOSED %s
+# UNCLOSED: error: {{.*}}:1: unexpected EOF
+
nop
.data
diff --git a/lld/test/ELF/linkerscript/outputarch.test b/lld/test/ELF/linkerscript/outputarch.test
index 4819a98..982e55a 100644
--- a/lld/test/ELF/linkerscript/outputarch.test
+++ b/lld/test/ELF/linkerscript/outputarch.test
@@ -1,5 +1,13 @@
# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-freebsd /dev/null -o %t1
-# RUN: ld.lld -shared -o %t2 %t1 %s
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 /dev/null -o a.o
+# RUN: ld.lld -shared -T 1.lds a.o
+#--- 1.lds
OUTPUT_ARCH(All data written here is ignored)
+
+#--- unclosed.lds
+OUTPUT_ARCH(All
+
+# RUN: not ld.lld -shared -T unclosed.lds a.o 2>&1 | FileCheck %s --check-prefix=UNCLOSED
+# UNCLOSED: error: unclosed.lds:1: unexpected EOF
diff --git a/lld/test/ELF/linkerscript/overlay.test b/lld/test/ELF/linkerscript/overlay.test
index b939ee4..7c64303 100644
--- a/lld/test/ELF/linkerscript/overlay.test
+++ b/lld/test/ELF/linkerscript/overlay.test
@@ -59,13 +59,13 @@ _start:
SECTIONS {
## LMA defaults to VMA
OVERLAY 0x1000 : {
- .big1 { *(.big1) }
+ ".big1" { *(".big1") }
.small1 { *(.small1) }
}
## .big2 starts at ADDR(.small2)
OVERLAY : AT (0x2008) {
.small2 { *(.small2) }
- .big2 { *(.big2) }
+ ".big2" { *(.big2) }
}
## .empty3 is not discarded. .small3 and .big3 share its address.
OVERLAY . : AT (0x2014) {
@@ -92,3 +92,11 @@ SECTIONS {
.out.aaa { *(.aaa) } > AX AT>FLASH
}
}
+
+#--- unclosed.lds
+SECTIONS {
+ OVERLAY 0x1000 : AT ( 0x2000 ) {
+
+# RUN: not ld.lld a.o -T unclosed.lds 2>&1 | FileCheck %s --check-prefix=UNCLOSED
+# UNCLOSED: error: unclosed.lds:2: unexpected EOF
+# UNCLOSED-NOT: {{.}}
diff --git a/lld/test/ELF/linkerscript/phdr-check.s b/lld/test/ELF/linkerscript/phdr-check.s
deleted file mode 100644
index 63fdec7..0000000
--- a/lld/test/ELF/linkerscript/phdr-check.s
+++ /dev/null
@@ -1,15 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
-
-# RUN: echo "SECTIONS { . = 0x10000000 + SIZEOF_HEADERS; .text : {*(.text.*)} }" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck %s
-# CHECK: ProgramHeaders [
-# CHECK-NEXT: ProgramHeader {
-# CHECK-NEXT: Type: PT_PHDR (0x6)
-# CHECK-NEXT: Offset: 0x40
-# CHECK-NEXT: VirtualAddress: 0x10000040
-
-.global _start
-_start:
- nop
diff --git a/lld/test/ELF/linkerscript/phdrs.s b/lld/test/ELF/linkerscript/phdrs.s
index 3e645f7..997f7e3 100644
--- a/lld/test/ELF/linkerscript/phdrs.s
+++ b/lld/test/ELF/linkerscript/phdrs.s
@@ -1,143 +1,116 @@
# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
-# RUN: echo "PHDRS {all PT_LOAD FILEHDR PHDRS ;} \
-# RUN: SECTIONS { \
-# RUN: . = 0x10000200; \
-# RUN: .text : {*(.text*)} :all \
-# RUN: .foo : {*(.foo.*)} :all \
-# RUN: .data : {*(.data.*)} :all}" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck %s
-
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+
+#--- 1.lds
+PHDRS {all PT_LOAD FILEHDR PHDRS ;}
+SECTIONS {
+ . = 0x10000200;
+ .text : {*(.text*)} :all
+ .foo : {*(.foo.*)} :"all"
+ .data : {*(.data.*)} : "all"}
+
+# RUN: ld.lld -o 1 -T 1.lds a.o
+# RUN: llvm-readelf -Sl 1 | FileCheck %s
+# CHECK: [Nr] Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK: [ 1] .text PROGBITS 0000000010000200 000200 000001 00 AX 0 0 4
+# CHECK-NEXT: [ 2] .foo PROGBITS 0000000010000201 000201 000008 00 WA 0 0 1
+
+# CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# CHECK-NEXT: LOAD 0x000000 0x0000000010000000 0x0000000010000000 0x000209 0x000209 RWE 0x1000
+
+#--- 2.lds
## Check that program headers are not written, unless we explicitly tell
## lld to do this.
-# RUN: echo "PHDRS {all PT_LOAD;} \
-# RUN: SECTIONS { \
-# RUN: . = 0x10000200; \
-# RUN: /DISCARD/ : {*(.text*)} \
-# RUN: .foo : {*(.foo.*)} :all \
-# RUN: }" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck --check-prefix=NOPHDR %s
-
+PHDRS {all PT_LOAD;}
+SECTIONS {
+ . = 0x10000200;
+ /DISCARD/ : {*(.text*)}
+ .foo : {*(.foo.*)} :all
+}
+
+# RUN: ld.lld -o 2 -T 2.lds a.o
+# RUN: llvm-readelf -l 2 | FileCheck --check-prefix=NOPHDR %s
+# NOPHDR: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# NOPHDR-NEXT: LOAD 0x000200 0x0000000010000200 0x0000000010000200 0x000008 0x000008 RW 0x1000
+
+#--- 3.lds
+PHDRS {all PT_LOAD FILEHDR PHDRS ;}
+SECTIONS {
+ . = 0x10000200;
+ .text : {*(.text*)} :all
+ .foo : {*(.foo.*)}
+ .data : {*(.data.*)} }
+
+# RUN: ld.lld -o 3 -T 3.lds a.o
+# RUN: llvm-readelf -l 3 | FileCheck --check-prefix=DEFHDR %s
+# DEFHDR: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# DEFHDR-NEXT: LOAD 0x000000 0x0000000010000000 0x0000000010000000 0x000209 0x000209 RWE 0x1000
+
+#--- at.lds
## Check the AT(expr)
-# RUN: echo "PHDRS {all PT_LOAD FILEHDR PHDRS AT(0x500 + 0x500) ;} \
-# RUN: SECTIONS { \
-# RUN: . = 0x10000200; \
-# RUN: .text : {*(.text*)} :all \
-# RUN: .foo : {*(.foo.*)} :all \
-# RUN: .data : {*(.data.*)} :all}" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck --check-prefix=AT %s
-
-# RUN: echo "PHDRS {all PT_LOAD FILEHDR PHDRS ;} \
-# RUN: SECTIONS { \
-# RUN: . = 0x10000200; \
-# RUN: .text : {*(.text*)} :all \
-# RUN: .foo : {*(.foo.*)} \
-# RUN: .data : {*(.data.*)} }" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-readobj -l %t1 | FileCheck --check-prefix=DEFHDR %s
+PHDRS {all PT_LOAD FILEHDR PHDRS AT(0x500 + 0x500) ;}
+SECTIONS {
+ . = 0x10000200;
+ .text : {*(.text*)} :all
+ .foo : {*(.foo.*)} :all
+ .data : {*(.data.*)} :all}
+
+# RUN: ld.lld -o at -T at.lds a.o
+# RUN: llvm-readelf -l at | FileCheck --check-prefix=AT %s
+# AT: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# AT-NEXT: LOAD 0x000000 0x0000000010000000 0x0000000000000a00 0x000209 0x000209 RWE 0x1000
+
+#--- int.lds
+## Check the numeric values for PHDRS.
+PHDRS {text PT_LOAD FILEHDR PHDRS; foo 0x11223344; }
+SECTIONS { . = SIZEOF_HEADERS; .foo : { *(.foo* .text*) } : text : foo}
+
+# RUN: ld.lld -o int -T int.lds a.o
+# RUN: llvm-readelf -l int | FileCheck --check-prefix=INT-PHDRS %s
+# INT-PHDRS: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align
+# INT-PHDRS-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0000b9 0x0000b9 RWE 0x1000
+# INT-PHDRS-NEXT: <unknown>: 0x11223344 0x0000b0 0x00000000000000b0 0x00000000000000b0 0x000009 0x000009 RWE 0x4
+#--- unspecified.lds
## Check that error is reported when trying to use phdr which is not listed
## inside PHDRS {} block
## TODO: If script doesn't contain PHDRS {} block then default phdr is always
## created and error is not reported.
-# RUN: echo "PHDRS { all PT_LOAD; } \
-# RUN: SECTIONS { .baz : {*(.foo.*)} :bar }" > %t.script
-# RUN: not ld.lld -o /dev/null --script %t.script %t 2>&1 | FileCheck --check-prefix=BADHDR %s
-
-# CHECK: ProgramHeaders [
-# CHECK-NEXT: ProgramHeader {
-# CHECK-NEXT: Type: PT_LOAD (0x1)
-# CHECK-NEXT: Offset: 0x0
-# CHECK-NEXT: VirtualAddress: 0x10000000
-# CHECK-NEXT: PhysicalAddress: 0x10000000
-# CHECK-NEXT: FileSize: 521
-# CHECK-NEXT: MemSize: 521
-# CHECK-NEXT: Flags [ (0x7)
-# CHECK-NEXT: PF_R (0x4)
-# CHECK-NEXT: PF_W (0x2)
-# CHECK-NEXT: PF_X (0x1)
-# CHECK-NEXT: ]
-
-# NOPHDR: ProgramHeaders [
-# NOPHDR-NEXT: ProgramHeader {
-# NOPHDR-NEXT: Type: PT_LOAD (0x1)
-# NOPHDR-NEXT: Offset: 0x200
-# NOPHDR-NEXT: VirtualAddress: 0x10000200
-# NOPHDR-NEXT: PhysicalAddress: 0x10000200
-# NOPHDR-NEXT: FileSize: 8
-# NOPHDR-NEXT: MemSize: 8
-# NOPHDR-NEXT: Flags [ (0x6)
-# NOPHDR-NEXT: PF_R (0x4)
-# NOPHDR-NEXT: PF_W (0x2)
-# NOPHDR-NEXT: ]
-# NOPHDR-NEXT: Alignment: 4096
-# NOPHDR-NEXT: }
-# NOPHDR-NEXT: ]
-
-# AT: ProgramHeaders [
-# AT-NEXT: ProgramHeader {
-# AT-NEXT: Type: PT_LOAD (0x1)
-# AT-NEXT: Offset: 0x0
-# AT-NEXT: VirtualAddress: 0x10000000
-# AT-NEXT: PhysicalAddress: 0xA00
-# AT-NEXT: FileSize: 521
-# AT-NEXT: MemSize: 521
-# AT-NEXT: Flags [ (0x7)
-# AT-NEXT: PF_R (0x4)
-# AT-NEXT: PF_W (0x2)
-# AT-NEXT: PF_X (0x1)
-# AT-NEXT: ]
+PHDRS { all PT_LOAD; }
+SECTIONS { .baz : {*(.foo.*)} :bar }
-## Check the numetic values for PHDRS.
-# RUN: echo "PHDRS {text PT_LOAD FILEHDR PHDRS; foo 0x11223344; } \
-# RUN: SECTIONS { . = SIZEOF_HEADERS; .foo : { *(.foo* .text*) } : text : foo}" > %t1.script
-# RUN: ld.lld -o %t2 --script %t1.script %t
-# RUN: llvm-readobj -l %t2 | FileCheck --check-prefix=INT-PHDRS %s
-
-# INT-PHDRS: ProgramHeaders [
-# INT-PHDRS: ProgramHeader {
-# INT-PHDRS: Type: Unknown (0x11223344)
-# INT-PHDRS-NEXT: Offset: 0xB0
-# INT-PHDRS-NEXT: VirtualAddress: 0xB0
-# INT-PHDRS-NEXT: PhysicalAddress: 0xB0
-# INT-PHDRS-NEXT: FileSize:
-# INT-PHDRS-NEXT: MemSize:
-# INT-PHDRS-NEXT: Flags [
-# INT-PHDRS-NEXT: PF_R
-# INT-PHDRS-NEXT: PF_W
-# INT-PHDRS-NEXT: PF_X
-# INT-PHDRS-NEXT: ]
-# INT-PHDRS-NEXT: Alignment:
-# INT-PHDRS-NEXT: }
-# INT-PHDRS-NEXT: ]
-
-# DEFHDR: ProgramHeaders [
-# DEFHDR-NEXT: ProgramHeader {
-# DEFHDR-NEXT: Type: PT_LOAD (0x1)
-# DEFHDR-NEXT: Offset: 0x0
-# DEFHDR-NEXT: VirtualAddress: 0x10000000
-# DEFHDR-NEXT: PhysicalAddress: 0x10000000
-# DEFHDR-NEXT: FileSize: 521
-# DEFHDR-NEXT: MemSize: 521
-# DEFHDR-NEXT: Flags [ (0x7)
-# DEFHDR-NEXT: PF_R (0x4)
-# DEFHDR-NEXT: PF_W (0x2)
-# DEFHDR-NEXT: PF_X (0x1)
-# DEFHDR-NEXT: ]
-
-# BADHDR: {{.*}}.script:1: program header 'bar' is not listed in PHDRS
-
-# RUN: echo "PHDRS { text PT_LOAD FOOHDR; }" > %t1.script
-# RUN: not ld.lld -o /dev/null --script %t1.script %t 2>&1 | FileCheck --check-prefix=FOOHDR %s
-# FOOHDR: error: {{.*}}.script:1: unexpected header attribute: FOOHDR
-
-# RUN: echo "PHDRS { text PT_FOO FOOHDR; }" > %t1.script
-# RUN: not ld.lld -o /dev/null --script %t1.script %t 2>&1 | FileCheck --check-prefix=PTFOO %s
-# PTFOO: invalid program header type: PT_FOO
+# RUN: not ld.lld -T unspecified.lds a.o 2>&1 | FileCheck --check-prefix=UNSPECIFIED %s
+# UNSPECIFIED: unspecified.lds:6: program header 'bar' is not listed in PHDRS
+
+#--- foohdr.lds
+PHDRS { text PT_LOAD FOOHDR; }
+
+# RUN: not ld.lld -T foohdr.lds a.o 2>&1 | FileCheck --check-prefix=FOOHDR %s
+# FOOHDR: error: foohdr.lds:1: unexpected header attribute: FOOHDR
+
+#--- pt_foo.lds
+PHDRS { text PT_FOO FOOHDR; }
+
+# RUN: not ld.lld -T pt_foo.lds a.o 2>&1 | FileCheck --check-prefix=PTFOO %s --strict-whitespace
+# PTFOO:{{.*}}error: pt_foo.lds:1: invalid program header type: PT_FOO
+# PTFOO-NEXT:>>> PHDRS { text PT_FOO FOOHDR; }
+# PTFOO-NEXT:>>> ^
+
+#--- unclosed.lds
+PHDRS { text PT_LOAD ;
+
+# RUN: not ld.lld -T unclosed.lds a.o 2>&1 | FileCheck --check-prefix=UNCLOSED %s
+# UNCLOSED:error: unclosed.lds:1: unexpected EOF
+# UNCLOSED-NOT:{{.}}
+
+#--- unclosed2.lds
+PHDRS { text PT_LOAD
+
+# RUN: not ld.lld -T unclosed2.lds a.o 2>&1 | FileCheck --check-prefix=UNCLOSED2 %s
+# UNCLOSED2: error: unclosed2.lds:1: unexpected header attribute:
+#--- a.s
.global _start
_start:
nop
diff --git a/lld/test/ELF/linkerscript/region-alias.s b/lld/test/ELF/linkerscript/region-alias.s
index db716e1..f6a6e1f 100644
--- a/lld/test/ELF/linkerscript/region-alias.s
+++ b/lld/test/ELF/linkerscript/region-alias.s
@@ -11,7 +11,7 @@
# RUN: }" > %t.script
## .text to ROM, .data to RAM.
-# RUN: echo "REGION_ALIAS (\"ALIAS_TEXT\", ROM);" > %t.script.inc
+# RUN: echo 'REGION_ALIAS ("ALIAS_TEXT", "ROM");' > %t.script.inc
# RUN: echo "REGION_ALIAS (\"ALIAS_DATA\", RAM);" >> %t.script.inc
# RUN: ld.lld %t --script %t.script -o %t2
# RUN: llvm-objdump --section-headers %t2 | FileCheck %s
diff --git a/lld/test/ELF/linkerscript/sections.s b/lld/test/ELF/linkerscript/sections.s
index fc03af8..2a27d9e 100644
--- a/lld/test/ELF/linkerscript/sections.s
+++ b/lld/test/ELF/linkerscript/sections.s
@@ -1,18 +1,22 @@
# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
-# Empty SECTIONS command.
-# RUN: echo "SECTIONS {}" > %t.script
-# RUN: ld.lld -o %t1 --script %t.script %t
-# RUN: llvm-objdump --section-headers %t1 | \
+#--- empty.lds
+SECTIONS {}
+
+# RUN: ld.lld -o empty -T empty.lds a.o
+# RUN: llvm-objdump --section-headers empty | \
# RUN: FileCheck -check-prefix=SEC-DEFAULT %s
+#--- 1.lds
# SECTIONS command with the same order as default.
-# RUN: echo "SECTIONS { \
-# RUN: .text : { *(.text) } \
-# RUN: .data : { *(.data) } }" > %t.script
-# RUN: ld.lld -o %t2 --script %t.script %t
-# RUN: llvm-objdump --section-headers %t2 | \
+SECTIONS {
+ .text : { *(.text) }
+ .data : { *(.data) } }
+
+# RUN: ld.lld -o 1 -T 1.lds a.o
+# RUN: llvm-objdump --section-headers 1 | \
# RUN: FileCheck -check-prefix=SEC-DEFAULT %s
# Idx Name Size
@@ -28,8 +32,8 @@
# .text and .data have swapped names but proper sizes and types.
# RUN: echo "SECTIONS { \
# RUN: .data : { *(.text) } \
-# RUN: .text : { *(.data) } }" > %t.script
-# RUN: ld.lld -o %t4 --script %t.script %t
+# RUN: .text : { *(.data) } }" > t.lds
+# RUN: ld.lld -o %t4 --script t.lds a.o
# RUN: llvm-objdump --section-headers %t4 | \
# RUN: FileCheck -check-prefix=SEC-SWAP-NAMES %s
@@ -50,8 +54,8 @@
# RUN: .text : { *(.text) } \
# RUN: .data : { *(.data) } } \
# RUN: SECTIONS { \
-# RUN: .data : { *(other) } }" > %t.script
-# RUN: ld.lld -o %t6 --script %t.script %t
+# RUN: .data : { *(other) } }" > t.lds
+# RUN: ld.lld -o %t6 --script t.lds a.o
# RUN: llvm-objdump --section-headers %t6 | \
# RUN: FileCheck -check-prefix=SEC-MULTI %s
@@ -72,7 +76,7 @@
# RUN: .data : { *(.data) } \
# RUN: .comment : { *(.comment) } \
# RUN: other : { *(other) } }' > %t5.lds
-# RUN: ld.lld -o %t5 -T %t5.lds %t
+# RUN: ld.lld -o %t5 -T %t5.lds a.o
# RUN: llvm-readelf -S -l %t5 | FileCheck --check-prefix=SEP-BY-NONALLOC %s
# SEP-BY-NONALLOC: [Nr] Name Type Address Off Size ES Flg
@@ -87,11 +91,26 @@
# SEP-BY-NONALLOC-NEXT: LOAD 0x00100e 0x000000000000000e 0x000000000000000e 0x000023 0x000025 RW 0x1000
# SEP-BY-NONALLOC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0
+#--- semi.lds
# Input section pattern contains additional semicolon.
# Case found in linux kernel script. Check we are able to parse it.
-# RUN: echo "SECTIONS { .text : { ;;*(.text);;S = 0;; } }" > %t.script
-# RUN: ld.lld -o /dev/null --script %t.script %t
+SECTIONS { .text : { ;;*(.text);;S = 0;; } }
+
+# RUN: ld.lld -T semi.lds a.o
+
+#--- unclosed.lds
+SECTIONS {
+ .text : { *(.text) }
+
+# RUN: not ld.lld -T unclosed.lds a.o 2>&1 | FileCheck --check-prefix=UNCLOSED %s
+# UNCLOSED:error: unclosed.lds:2: unexpected EOF
+# UNCLOSED-NOT:{{.}}
+
+#--- unclosed-out.lds
+SECTIONS {
+ .text : { *(.text)
+#--- a.s
.globl _start
_start:
mov $60, %rax
diff --git a/lld/test/ELF/linkerscript/symbolreferenced.s b/lld/test/ELF/linkerscript/symbolreferenced.s
index 6848082..ad356d4 100644
--- a/lld/test/ELF/linkerscript/symbolreferenced.s
+++ b/lld/test/ELF/linkerscript/symbolreferenced.s
@@ -94,7 +94,7 @@ PROVIDE(unused = g1);
PROVIDE_HIDDEN(another_unused = g1);
#--- chain_with_cycle.t
-PROVIDE(f1 = f2 + f3);
+PROVIDE("f1" = f2 + f3);
PROVIDE(f2 = f3 + f4);
PROVIDE(f3 = f4);
PROVIDE(f4 = f1);
diff --git a/lld/test/ELF/linkerscript/unquoted.test b/lld/test/ELF/linkerscript/unquoted.test
index 7dca75f..9a30ae8 100644
--- a/lld/test/ELF/linkerscript/unquoted.test
+++ b/lld/test/ELF/linkerscript/unquoted.test
@@ -12,11 +12,10 @@ INCLUDE "empty.lds"
INCLUDE "1.lds"
# RUN: not ld.lld -shared 0.o -T 1.lds 2>&1 | FileCheck %s --check-prefix=CHECK1 --match-full-lines --strict-whitespace
-# RUN: not ld.lld -shared 0.o -T 1a.lds 2>&1 | FileCheck %s --check-prefix=CHECK1A --match-full-lines --strict-whitespace
-# CHECK1:{{.*}}error: 1.lds:1: unclosed comment in a linker script
-# CHECK1A:{{.*}}error: 1a.lds:3: unclosed comment in a linker script
-#CHECK1A-NEXT:>>> INCLUDE "1.lds"
-#CHECK1A-NEXT:>>> ^
+# RUN: not ld.lld -shared 0.o -T 1a.lds 2>&1 | FileCheck %s --check-prefix=CHECK1 --match-full-lines --strict-whitespace
+# CHECK1:{{.*}}error: 1.lds:2: unclosed comment in a linker script
+# CHECK1-NEXT:>>> SECTIONS /*
+# CHECK1-NEXT:>>> ^
#--- 2.lds
INCLUDE "empty.lds"
diff --git a/lld/test/ELF/zsectionheader.s b/lld/test/ELF/zsectionheader.s
new file mode 100644
index 0000000..c1e654a
--- /dev/null
+++ b/lld/test/ELF/zsectionheader.s
@@ -0,0 +1,36 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld -shared -z nosectionheader -z sectionheader %t.o -o %t.so 2>&1 | count 0
+# RUN: llvm-readelf -hS %t.so | FileCheck %s --check-prefixes=CHECK,SHDR
+
+# RUN: ld.lld -shared -z nosectionheader %t.o -o %t0.so
+# RUN: llvm-readelf -h --dyn-syms %t0.so | FileCheck %s --check-prefixes=CHECK,NOSHDR
+# RUN: llvm-strings %t0.so | FileCheck %s --check-prefixes=NOSHDR-STR
+
+# CHECK: Size of this header: 64 (bytes)
+# CHECK-NEXT: Size of program headers: 56 (bytes)
+# CHECK-NEXT: Number of program headers: 6
+# CHECK-NEXT: Size of section headers: 64 (bytes)
+# SHDR-NEXT: Number of section headers: 13
+# SHDR-NEXT: Section header string table index: 11
+# NOSHDR-NEXT: Number of section headers: 0
+# NOSHDR-NEXT: Section header string table index: 0
+
+# SHDR: Section Headers:
+# NOSHDR: Symbol table for image contains 2 entries:
+# NOSHDR: _start
+
+## _start occurs as a dynamic string table entry. There is no static string table
+## entry. `nonalloc` is not in the output.
+# NOSHDR-STR: _start
+# NOSHDR-STR-NOT: _start
+
+# RUN: not ld.lld -r -z nosectionheader %t.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR
+
+# ERR: error: -r and -z nosectionheader may not be used together
+
+.globl _start
+_start:
+
+.section nonalloc,""
+.asciz "_start"
diff --git a/lldb/cmake/modules/AddLLDB.cmake b/lldb/cmake/modules/AddLLDB.cmake
index 5380290..0a81ec5 100644
--- a/lldb/cmake/modules/AddLLDB.cmake
+++ b/lldb/cmake/modules/AddLLDB.cmake
@@ -258,6 +258,7 @@ function(add_lldb_tool name)
endif()
add_lldb_executable(${name} GENERATE_INSTALL ${ARG_UNPARSED_ARGUMENTS})
+ set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
endfunction()
# The test suite relies on finding LLDB.framework binary resources in the
diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index 38a291d..a23f834 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -10,6 +10,7 @@
#define LLDB_CORE_PLUGINMANAGER_H
#include "lldb/Core/Architecture.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
#include "lldb/Symbol/TypeSystem.h"
#include "lldb/Utility/CompletionRequest.h"
#include "lldb/Utility/FileSpec.h"
@@ -487,6 +488,25 @@ public:
static LanguageSet GetAllTypeSystemSupportedLanguagesForExpressions();
+ // Scripted Interface
+ static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
+ ScriptedInterfaceCreateInstance create_callback,
+ lldb::ScriptLanguage language,
+ ScriptedInterfaceUsages usages);
+
+ static bool UnregisterPlugin(ScriptedInterfaceCreateInstance create_callback);
+
+ static uint32_t GetNumScriptedInterfaces();
+
+ static llvm::StringRef GetScriptedInterfaceNameAtIndex(uint32_t idx);
+
+ static llvm::StringRef GetScriptedInterfaceDescriptionAtIndex(uint32_t idx);
+
+ static lldb::ScriptLanguage GetScriptedInterfaceLanguageAtIndex(uint32_t idx);
+
+ static ScriptedInterfaceUsages
+ GetScriptedInterfaceUsagesAtIndex(uint32_t idx);
+
// REPL
static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
REPLCreateInstance create_callback,
diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h
index d48dbcd..20c4769 100644
--- a/lldb/include/lldb/Interpreter/CommandObject.h
+++ b/lldb/include/lldb/Interpreter/CommandObject.h
@@ -369,13 +369,14 @@ protected:
"currently stopped.";
}
- // This is for use in the command interpreter, when you either want the
- // selected target, or if no target is present you want to prime the dummy
- // target with entities that will be copied over to new targets.
- Target &GetSelectedOrDummyTarget(bool prefer_dummy = false);
- Target &GetSelectedTarget();
Target &GetDummyTarget();
+ // This is for use in the command interpreter, and returns the most relevant
+ // target. In order of priority, that's the target from the command object's
+ // execution context, the target from the interpreter's execution context, the
+  // selected target, or the dummy target.
+ Target &GetTarget();
+
// If a command needs to use the "current" thread, use this call. Command
// objects will have an ExecutionContext to use, and that may or may not have
// a thread in it. If it does, you should use that by default, if not, then
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
index 69504db..3ce47d0 100644
--- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterface.h
@@ -9,6 +9,8 @@
#ifndef LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
#define LLDB_INTERPRETER_INTERFACES_SCRIPTEDINTERFACE_H
+#include "ScriptedInterfaceUsages.h"
+
#include "lldb/Core/StructuredDataImpl.h"
#include "lldb/Utility/LLDBLog.h"
#include "lldb/Utility/Log.h"
@@ -68,6 +70,11 @@ public:
return true;
}
+ static bool CreateInstance(lldb::ScriptLanguage language,
+ ScriptedInterfaceUsages usages) {
+ return false;
+ }
+
protected:
StructuredData::GenericSP m_object_instance_sp;
};
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
new file mode 100644
index 0000000..36c0cfd
--- /dev/null
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h
@@ -0,0 +1,43 @@
+//===-- ScriptedInterfaceUsages.h ---------------------------- -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
+#define LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
+
+#include "lldb/lldb-types.h"
+
+#include "lldb/Utility/Stream.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace lldb_private {
+class ScriptedInterfaceUsages {
+public:
+ ScriptedInterfaceUsages() = default;
+ ScriptedInterfaceUsages(const std::vector<llvm::StringRef> ci_usages,
+ const std::vector<llvm::StringRef> sbapi_usages)
+ : m_command_interpreter_usages(ci_usages), m_sbapi_usages(sbapi_usages) {}
+
+ const std::vector<llvm::StringRef> &GetCommandInterpreterUsages() const {
+ return m_command_interpreter_usages;
+ }
+
+ const std::vector<llvm::StringRef> &GetSBAPIUsages() const {
+ return m_sbapi_usages;
+ }
+
+ enum class UsageKind { CommandInterpreter, API };
+
+ void Dump(Stream &s, UsageKind kind) const;
+
+private:
+ std::vector<llvm::StringRef> m_command_interpreter_usages;
+ std::vector<llvm::StringRef> m_sbapi_usages;
+};
+} // namespace lldb_private
+
+#endif // LLDB_INTERPRETER_SCRIPTEDINTERFACEUSAGES_H
diff --git a/lldb/include/lldb/Symbol/SymbolFile.h b/lldb/include/lldb/Symbol/SymbolFile.h
index d207667..8419495 100644
--- a/lldb/include/lldb/Symbol/SymbolFile.h
+++ b/lldb/include/lldb/Symbol/SymbolFile.h
@@ -211,7 +211,15 @@ public:
/// The characteristics of an array type.
struct ArrayInfo {
int64_t first_index = 0;
- llvm::SmallVector<uint64_t, 1> element_orders;
+
+ ///< Each entry belongs to a distinct DW_TAG_subrange_type.
+ ///< For multi-dimensional DW_TAG_array_types we would have
+ ///< an entry for each dimension. An entry represents the
+ ///< optional element count of the subrange.
+ ///
+ ///< The order of entries follows the order of the DW_TAG_subrange_type
+ ///< children of this DW_TAG_array_type.
+ llvm::SmallVector<std::optional<uint64_t>, 1> element_orders;
uint32_t byte_stride = 0;
uint32_t bit_stride = 0;
};
diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h
index ebb0ec4..a9e8406 100644
--- a/lldb/include/lldb/Symbol/UnwindPlan.h
+++ b/lldb/include/lldb/Symbol/UnwindPlan.h
@@ -68,7 +68,8 @@ public:
isAFAPlusOffset, // reg = AFA + offset
inOtherRegister, // reg = other reg
atDWARFExpression, // reg = deref(eval(dwarf_expr))
- isDWARFExpression // reg = eval(dwarf_expr)
+ isDWARFExpression, // reg = eval(dwarf_expr)
+ isConstant // reg = constant
};
RegisterLocation() : m_location() {}
@@ -105,6 +106,15 @@ public:
bool IsDWARFExpression() const { return m_type == isDWARFExpression; }
+ bool IsConstant() const { return m_type == isConstant; }
+
+ void SetIsConstant(uint64_t value) {
+ m_type = isConstant;
+ m_location.constant_value = value;
+ }
+
+ uint64_t GetConstant() const { return m_location.constant_value; }
+
void SetAtCFAPlusOffset(int32_t offset) {
m_type = atCFAPlusOffset;
m_location.offset = offset;
@@ -192,6 +202,8 @@ public:
const uint8_t *opcodes;
uint16_t length;
} expr;
+ // For m_type == isConstant
+ uint64_t constant_value;
} m_location;
};
@@ -358,6 +370,9 @@ public:
bool SetRegisterLocationToSame(uint32_t reg_num, bool must_replace);
+ bool SetRegisterLocationToIsConstant(uint32_t reg_num, uint64_t constant,
+ bool can_replace);
+
// When this UnspecifiedRegistersAreUndefined mode is
// set, any register that is not specified by this Row will
// be described as Undefined.
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index 5d5ae1b..119dff4 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -141,6 +141,8 @@ public:
PathMappingList &GetSourcePathMap() const;
+ PathMappingList &GetObjectPathMap() const;
+
bool GetAutoSourceMapRelative() const;
FileSpecList GetExecutableSearchPaths();
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index 10eaf1e..87c5ff8 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -25,6 +25,7 @@ class Value;
} // namespace llvm
namespace lldb_private {
+class ScriptedInterfaceUsages;
typedef lldb::ABISP (*ABICreateInstance)(lldb::ProcessSP process_sp,
const ArchSpec &arch);
typedef std::unique_ptr<Architecture> (*ArchitectureCreateInstance)(
@@ -124,6 +125,8 @@ typedef lldb::REPLSP (*REPLCreateInstance)(Status &error,
lldb::LanguageType language,
Debugger *debugger, Target *target,
const char *repl_options);
+typedef bool (*ScriptedInterfaceCreateInstance)(lldb::ScriptLanguage language,
+ ScriptedInterfaceUsages usages);
typedef int (*ComparisonFunction)(const void *, const void *);
typedef void (*DebuggerInitializeCallback)(Debugger &debugger);
/// Trace
diff --git a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
index e3c6fd1..602e15d 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
@@ -266,17 +266,13 @@ def getCompiler():
return module.getCompiler()
-def getCompilerBinary():
- """Returns the compiler binary the test suite is running with."""
- return getCompiler().split()[0]
-
-
def getCompilerVersion():
"""Returns a string that represents the compiler version.
Supports: llvm, clang.
"""
- compiler = getCompilerBinary()
- version_output = subprocess.check_output([compiler, "--version"], errors="replace")
+ version_output = subprocess.check_output(
+ [getCompiler(), "--version"], errors="replace"
+ )
m = re.search("version ([0-9.]+)", version_output)
if m:
return m.group(1)
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index 5e50b0c..b57c3bd 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1379,10 +1379,6 @@ class Base(unittest.TestCase):
"""Returns the compiler in effect the test suite is running with."""
return lldbplatformutil.getCompiler()
- def getCompilerBinary(self):
- """Returns the compiler binary the test suite is running with."""
- return lldbplatformutil.getCompilerBinary()
-
def getCompilerVersion(self):
"""Returns a string that represents the compiler version.
Supports: llvm, clang.
@@ -1518,19 +1514,23 @@ class Base(unittest.TestCase):
stdflag = "-std=c++11"
return stdflag
- def buildDriver(self, sources, exe_name):
+ def buildDriver(self, sources, exe_name, defines=None):
"""Platform-specific way to build a program that links with LLDB (via the liblldb.so
or LLDB.framework).
"""
+ if defines is None:
+ defines = []
+
stdflag = self.getstdFlag()
stdlibflag = self.getstdlibFlag()
+ defines = " ".join(["-D{}={}".format(name, value) for name, value in defines])
lib_dir = configuration.lldb_libs_dir
if self.hasDarwinFramework():
d = {
"CXX_SOURCES": sources,
"EXE": exe_name,
- "CFLAGS_EXTRAS": "%s %s" % (stdflag, stdlibflag),
+ "CFLAGS_EXTRAS": "%s %s %s" % (stdflag, stdlibflag, defines),
"FRAMEWORK_INCLUDES": "-F%s" % self.framework_dir,
"LD_EXTRAS": "%s -Wl,-rpath,%s" % (self.lib_lldb, self.framework_dir),
}
@@ -1538,12 +1538,13 @@ class Base(unittest.TestCase):
d = {
"CXX_SOURCES": sources,
"EXE": exe_name,
- "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
+ "CFLAGS_EXTRAS": "%s %s -I%s -I%s %s"
% (
stdflag,
stdlibflag,
os.path.join(os.environ["LLDB_SRC"], "include"),
os.path.join(configuration.lldb_obj_root, "include"),
+ defines,
),
"LD_EXTRAS": "-L%s -lliblldb" % lib_dir,
}
@@ -1551,12 +1552,13 @@ class Base(unittest.TestCase):
d = {
"CXX_SOURCES": sources,
"EXE": exe_name,
- "CFLAGS_EXTRAS": "%s %s -I%s -I%s"
+ "CFLAGS_EXTRAS": "%s %s -I%s -I%s %s"
% (
stdflag,
stdlibflag,
os.path.join(os.environ["LLDB_SRC"], "include"),
os.path.join(configuration.lldb_obj_root, "include"),
+ defines,
),
"LD_EXTRAS": "-L%s -llldb -Wl,-rpath,%s" % (lib_dir, lib_dir),
}
diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp
index 773f8ed..aad03af 100644
--- a/lldb/source/Commands/CommandObjectBreakpoint.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp
@@ -539,7 +539,8 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget(m_dummy_options.m_use_dummy);
+ Target &target =
+ m_dummy_options.m_use_dummy ? GetDummyTarget() : GetTarget();
// The following are the various types of breakpoints that could be set:
// 1). -f -l -p [-s -g] (setting breakpoint by source location)
@@ -839,7 +840,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget(m_dummy_opts.m_use_dummy);
+ Target &target = m_dummy_opts.m_use_dummy ? GetDummyTarget() : GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
@@ -903,7 +904,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
@@ -1010,7 +1011,7 @@ the second re-enables the first location.");
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
@@ -1148,7 +1149,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+ Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
const BreakpointList &breakpoints =
target.GetBreakpointList(m_options.m_internal);
@@ -1267,7 +1268,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
// The following are the various types of breakpoints that could be
// cleared:
@@ -1416,7 +1417,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+ Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
result.Clear();
std::unique_lock<std::recursive_mutex> lock;
@@ -1676,7 +1677,7 @@ protected:
return;
}
- Target &target = GetSelectedOrDummyTarget(false);
+ Target &target = GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
@@ -1764,7 +1765,7 @@ protected:
}
Target &target =
- GetSelectedOrDummyTarget(m_name_options.m_use_dummy.GetCurrentValue());
+ m_name_options.m_use_dummy ? GetDummyTarget() : GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
@@ -1838,7 +1839,7 @@ protected:
}
Target &target =
- GetSelectedOrDummyTarget(m_name_options.m_use_dummy.GetCurrentValue());
+ m_name_options.m_use_dummy ? GetDummyTarget() : GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
@@ -1897,7 +1898,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
Target &target =
- GetSelectedOrDummyTarget(m_name_options.m_use_dummy.GetCurrentValue());
+ m_name_options.m_use_dummy ? GetDummyTarget() : GetTarget();
std::vector<std::string> name_list;
if (command.empty()) {
@@ -2209,7 +2210,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
@@ -2319,7 +2320,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
std::unique_lock<std::recursive_mutex> lock;
target.GetBreakpointList().GetListMutex(lock);
diff --git a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
index 6ebe6e8..8c1fb51 100644
--- a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp
@@ -323,7 +323,7 @@ are no syntax errors may indicate that a function was declared but never called.
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+ Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
const BreakpointList &breakpoints = target.GetBreakpointList();
size_t num_breakpoints = breakpoints.GetSize();
@@ -481,7 +481,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget(m_options.m_use_dummy);
+ Target &target = m_options.m_use_dummy ? GetDummyTarget() : GetTarget();
const BreakpointList &breakpoints = target.GetBreakpointList();
size_t num_breakpoints = breakpoints.GetSize();
@@ -548,9 +548,9 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
- const BreakpointList &breakpoints = target->GetBreakpointList();
+ const BreakpointList &breakpoints = target.GetBreakpointList();
size_t num_breakpoints = breakpoints.GetSize();
if (num_breakpoints == 0) {
@@ -566,7 +566,7 @@ protected:
BreakpointIDList valid_bp_ids;
CommandObjectMultiwordBreakpoint::VerifyBreakpointOrLocationIDs(
- command, target, result, &valid_bp_ids,
+ command, &target, result, &valid_bp_ids,
BreakpointName::Permissions::PermissionKinds::listPerm);
if (result.Succeeded()) {
@@ -575,7 +575,7 @@ protected:
BreakpointID cur_bp_id = valid_bp_ids.GetBreakpointIDAtIndex(i);
if (cur_bp_id.GetBreakpointID() != LLDB_INVALID_BREAK_ID) {
Breakpoint *bp =
- target->GetBreakpointByID(cur_bp_id.GetBreakpointID()).get();
+ target.GetBreakpointByID(cur_bp_id.GetBreakpointID()).get();
if (bp) {
BreakpointLocationSP bp_loc_sp;
diff --git a/lldb/source/Commands/CommandObjectDisassemble.cpp b/lldb/source/Commands/CommandObjectDisassemble.cpp
index d975e39..652a300 100644
--- a/lldb/source/Commands/CommandObjectDisassemble.cpp
+++ b/lldb/source/Commands/CommandObjectDisassemble.cpp
@@ -227,7 +227,7 @@ llvm::Error CommandObjectDisassemble::CheckRangeSize(const AddressRange &range,
return llvm::Error::success();
StreamString msg;
msg << "Not disassembling " << what << " because it is very large ";
- range.Dump(&msg, &GetSelectedTarget(), Address::DumpStyleLoadAddress,
+ range.Dump(&msg, &GetTarget(), Address::DumpStyleLoadAddress,
Address::DumpStyleFileAddress);
msg << ". To disassemble specify an instruction count limit, start/stop "
"addresses or use the --force option.";
@@ -252,7 +252,7 @@ CommandObjectDisassemble::GetContainingAddressRanges() {
}
};
- Target &target = GetSelectedTarget();
+ Target &target = GetTarget();
if (!target.GetSectionLoadList().IsEmpty()) {
Address symbol_containing_address;
if (target.GetSectionLoadList().ResolveLoadAddress(
@@ -351,8 +351,8 @@ CommandObjectDisassemble::GetNameRanges(CommandReturnObject &result) {
// Find functions matching the given name.
SymbolContextList sc_list;
- GetSelectedTarget().GetImages().FindFunctions(name, eFunctionNameTypeAuto,
- function_options, sc_list);
+ GetTarget().GetImages().FindFunctions(name, eFunctionNameTypeAuto,
+ function_options, sc_list);
std::vector<AddressRange> ranges;
llvm::Error range_errs = llvm::Error::success();
@@ -439,10 +439,10 @@ CommandObjectDisassemble::GetRangesForSelectedMode(
void CommandObjectDisassemble::DoExecute(Args &command,
CommandReturnObject &result) {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
if (!m_options.arch.IsValid())
- m_options.arch = target->GetArchitecture();
+ m_options.arch = target.GetArchitecture();
if (!m_options.arch.IsValid()) {
result.AppendError(
@@ -535,7 +535,7 @@ void CommandObjectDisassemble::DoExecute(Args &command,
} else {
result.AppendErrorWithFormat(
"Failed to disassemble memory at 0x%8.8" PRIx64 ".\n",
- cur_range.GetBaseAddress().GetLoadAddress(target));
+ cur_range.GetBaseAddress().GetLoadAddress(&target));
}
}
if (print_sc_header)
diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp
index eb76753..769f01d 100644
--- a/lldb/source/Commands/CommandObjectExpression.cpp
+++ b/lldb/source/Commands/CommandObjectExpression.cpp
@@ -605,7 +605,7 @@ void CommandObjectExpression::DoExecute(llvm::StringRef command,
return;
if (m_repl_option.GetOptionValue().GetCurrentValue()) {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
// Drop into REPL
m_expr_lines.clear();
m_expr_line_count = 0;
@@ -665,7 +665,7 @@ void CommandObjectExpression::DoExecute(llvm::StringRef command,
}
}
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
if (EvaluateExpression(expr, result.GetOutputStream(),
result.GetErrorStream(), result)) {
diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp
index 3f4178c..29e460f 100644
--- a/lldb/source/Commands/CommandObjectFrame.cpp
+++ b/lldb/source/Commands/CommandObjectFrame.cpp
@@ -687,7 +687,7 @@ protected:
m_cmd_name);
// Increment statistics.
- TargetStats &target_stats = GetSelectedOrDummyTarget().GetStatistics();
+ TargetStats &target_stats = GetTarget().GetStatistics();
if (result.Succeeded())
target_stats.GetFrameVariableStats().NotifySuccess();
else
@@ -874,13 +874,13 @@ void CommandObjectFrameRecognizerAdd::DoExecute(Args &command,
RegularExpressionSP(new RegularExpression(m_options.m_module));
auto func =
RegularExpressionSP(new RegularExpression(m_options.m_symbols.front()));
- GetSelectedOrDummyTarget().GetFrameRecognizerManager().AddRecognizer(
+ GetTarget().GetFrameRecognizerManager().AddRecognizer(
recognizer_sp, module, func, m_options.m_first_instruction_only);
} else {
auto module = ConstString(m_options.m_module);
std::vector<ConstString> symbols(m_options.m_symbols.begin(),
m_options.m_symbols.end());
- GetSelectedOrDummyTarget().GetFrameRecognizerManager().AddRecognizer(
+ GetTarget().GetFrameRecognizerManager().AddRecognizer(
recognizer_sp, module, symbols, m_options.m_first_instruction_only);
}
#endif
@@ -898,9 +898,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- GetSelectedOrDummyTarget()
- .GetFrameRecognizerManager()
- .RemoveAllRecognizers();
+ GetTarget().GetFrameRecognizerManager().RemoveAllRecognizers();
result.SetStatus(eReturnStatusSuccessFinishResult);
}
};
@@ -922,7 +920,7 @@ public:
if (request.GetCursorIndex() != 0)
return;
- GetSelectedOrDummyTarget().GetFrameRecognizerManager().ForEach(
+ GetTarget().GetFrameRecognizerManager().ForEach(
[&request](uint32_t rid, std::string rname, std::string module,
llvm::ArrayRef<lldb_private::ConstString> symbols,
bool regexp) {
@@ -953,9 +951,7 @@ protected:
return;
}
- GetSelectedOrDummyTarget()
- .GetFrameRecognizerManager()
- .RemoveAllRecognizers();
+ GetTarget().GetFrameRecognizerManager().RemoveAllRecognizers();
result.SetStatus(eReturnStatusSuccessFinishResult);
return;
}
@@ -973,9 +969,8 @@ protected:
return;
}
- if (!GetSelectedOrDummyTarget()
- .GetFrameRecognizerManager()
- .RemoveRecognizerWithID(recognizer_id)) {
+ if (!GetTarget().GetFrameRecognizerManager().RemoveRecognizerWithID(
+ recognizer_id)) {
result.AppendErrorWithFormat("'%s' is not a valid recognizer id.\n",
command.GetArgumentAtIndex(0));
return;
@@ -996,7 +991,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
bool any_printed = false;
- GetSelectedOrDummyTarget().GetFrameRecognizerManager().ForEach(
+ GetTarget().GetFrameRecognizerManager().ForEach(
[&result, &any_printed](
uint32_t recognizer_id, std::string name, std::string module,
llvm::ArrayRef<ConstString> symbols, bool regexp) {
@@ -1073,9 +1068,8 @@ protected:
return;
}
- auto recognizer = GetSelectedOrDummyTarget()
- .GetFrameRecognizerManager()
- .GetRecognizerForFrame(frame_sp);
+ auto recognizer =
+ GetTarget().GetFrameRecognizerManager().GetRecognizerForFrame(frame_sp);
Stream &output_stream = result.GetOutputStream();
output_stream.Printf("frame %d ", frame_index);
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index e605abd..e8174ca 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -1584,7 +1584,7 @@ public:
protected:
void DoExecute(Args &signal_args, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
// Any signals that are being set should be added to the Target's
// DummySignals so they will get applied on rerun, etc.
diff --git a/lldb/source/Commands/CommandObjectScripting.cpp b/lldb/source/Commands/CommandObjectScripting.cpp
index fee0565..730a190 100644
--- a/lldb/source/Commands/CommandObjectScripting.cpp
+++ b/lldb/source/Commands/CommandObjectScripting.cpp
@@ -8,12 +8,14 @@
#include "CommandObjectScripting.h"
#include "lldb/Core/Debugger.h"
+#include "lldb/Core/PluginManager.h"
#include "lldb/DataFormatters/DataVisualization.h"
#include "lldb/Host/Config.h"
#include "lldb/Host/OptionParser.h"
#include "lldb/Interpreter/CommandInterpreter.h"
#include "lldb/Interpreter/CommandOptionArgumentTable.h"
#include "lldb/Interpreter/CommandReturnObject.h"
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
#include "lldb/Interpreter/OptionArgParser.h"
#include "lldb/Interpreter/ScriptInterpreter.h"
#include "lldb/Utility/Args.h"
@@ -127,9 +129,126 @@ private:
CommandOptions m_options;
};
-#pragma mark CommandObjectMultiwordScripting
+#define LLDB_OPTIONS_scripting_template_list
+#include "CommandOptions.inc"
+
+class CommandObjectScriptingTemplateList : public CommandObjectParsed {
+public:
+ CommandObjectScriptingTemplateList(CommandInterpreter &interpreter)
+ : CommandObjectParsed(
+ interpreter, "scripting template list",
+ "List all the available scripting extension templates. ",
+ "scripting template list [--language <scripting-language> --]") {}
+
+ ~CommandObjectScriptingTemplateList() override = default;
+
+ Options *GetOptions() override { return &m_options; }
+
+ class CommandOptions : public Options {
+ public:
+ CommandOptions() = default;
+ ~CommandOptions() override = default;
+ Status SetOptionValue(uint32_t option_idx, llvm::StringRef option_arg,
+ ExecutionContext *execution_context) override {
+ Status error;
+ const int short_option = m_getopt_table[option_idx].val;
-// CommandObjectMultiwordScripting
+ switch (short_option) {
+ case 'l':
+ m_language = (lldb::ScriptLanguage)OptionArgParser::ToOptionEnum(
+ option_arg, GetDefinitions()[option_idx].enum_values,
+ eScriptLanguageNone, error);
+ if (!error.Success())
+ error.SetErrorStringWithFormatv(
+ "unrecognized value for language '{0}'", option_arg);
+ break;
+ default:
+ llvm_unreachable("Unimplemented option");
+ }
+
+ return error;
+ }
+
+ void OptionParsingStarting(ExecutionContext *execution_context) override {
+ m_language = lldb::eScriptLanguageDefault;
+ }
+
+ llvm::ArrayRef<OptionDefinition> GetDefinitions() override {
+ return llvm::ArrayRef(g_scripting_template_list_options);
+ }
+
+ lldb::ScriptLanguage m_language = lldb::eScriptLanguageDefault;
+ };
+
+protected:
+ void DoExecute(Args &command, CommandReturnObject &result) override {
+ Stream &s = result.GetOutputStream();
+ s.Printf("Available scripted extension templates:");
+
+ auto print_field = [&s](llvm::StringRef key, llvm::StringRef value) {
+ if (!value.empty()) {
+ s.IndentMore();
+ s.Indent();
+ s << key << ": " << value << '\n';
+ s.IndentLess();
+ }
+ };
+
+ size_t num_listed_interface = 0;
+ size_t num_templates = PluginManager::GetNumScriptedInterfaces();
+ for (size_t i = 0; i < num_templates; i++) {
+ llvm::StringRef plugin_name =
+ PluginManager::GetScriptedInterfaceNameAtIndex(i);
+ if (plugin_name.empty())
+ break;
+
+ lldb::ScriptLanguage lang =
+ PluginManager::GetScriptedInterfaceLanguageAtIndex(i);
+ if (lang != m_options.m_language)
+ continue;
+
+ if (!num_listed_interface)
+ s.EOL();
+
+ num_listed_interface++;
+
+ llvm::StringRef desc =
+ PluginManager::GetScriptedInterfaceDescriptionAtIndex(i);
+ ScriptedInterfaceUsages usages =
+ PluginManager::GetScriptedInterfaceUsagesAtIndex(i);
+
+ print_field("Name", plugin_name);
+ print_field("Language", ScriptInterpreter::LanguageToString(lang));
+ print_field("Description", desc);
+ usages.Dump(s, ScriptedInterfaceUsages::UsageKind::API);
+ usages.Dump(s, ScriptedInterfaceUsages::UsageKind::CommandInterpreter);
+
+ if (i != num_templates - 1)
+ s.EOL();
+ }
+
+ if (!num_listed_interface)
+ s << " None\n";
+ }
+
+private:
+ CommandOptions m_options;
+};
+
+class CommandObjectMultiwordScriptingTemplate : public CommandObjectMultiword {
+public:
+ CommandObjectMultiwordScriptingTemplate(CommandInterpreter &interpreter)
+ : CommandObjectMultiword(
+ interpreter, "scripting template",
+ "Commands for operating on the scripting templates.",
+ "scripting template [<subcommand-options>]") {
+ LoadSubCommand(
+ "list",
+ CommandObjectSP(new CommandObjectScriptingTemplateList(interpreter)));
+ }
+
+ ~CommandObjectMultiwordScriptingTemplate() override = default;
+};
CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
CommandInterpreter &interpreter)
@@ -139,6 +258,9 @@ CommandObjectMultiwordScripting::CommandObjectMultiwordScripting(
"scripting <subcommand> [<subcommand-options>]") {
LoadSubCommand("run",
CommandObjectSP(new CommandObjectScriptingRun(interpreter)));
+ LoadSubCommand("template",
+ CommandObjectSP(
+ new CommandObjectMultiwordScriptingTemplate(interpreter)));
}
CommandObjectMultiwordScripting::~CommandObjectMultiwordScripting() = default;
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index cc381a2..b77bd8b 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -97,7 +97,7 @@ static void DumpTargetInfo(uint32_t target_idx, Target *target,
uint32_t properties = 0;
if (target_arch.IsValid()) {
- strm.Printf("%sarch=", properties++ > 0 ? ", " : " ( ");
+ strm.Printf(" ( arch=");
target_arch.DumpTriple(strm.AsRawOstream());
properties++;
}
@@ -1027,7 +1027,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
const size_t argc = command.GetArgumentCount();
if (argc & 1) {
result.AppendError("add requires an even number of arguments\n");
@@ -1045,7 +1045,7 @@ protected:
from, to);
}
bool last_pair = ((argc - i) == 2);
- target->GetImageSearchPathList().Append(
+ target.GetImageSearchPathList().Append(
from, to, last_pair); // Notify if this is the last pair
result.SetStatus(eReturnStatusSuccessFinishNoResult);
} else {
@@ -1074,9 +1074,9 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
bool notify = true;
- target->GetImageSearchPathList().Clear(notify);
+ target.GetImageSearchPathList().Clear(notify);
result.SetStatus(eReturnStatusSuccessFinishNoResult);
}
};
@@ -1148,7 +1148,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
size_t argc = command.GetArgumentCount();
// check for at least 3 arguments and an odd number of parameters
if (argc >= 3 && argc & 1) {
@@ -1171,8 +1171,8 @@ protected:
if (from[0] && to[0]) {
bool last_pair = ((argc - i) == 2);
- target->GetImageSearchPathList().Insert(from, to, insert_idx,
- last_pair);
+ target.GetImageSearchPathList().Insert(from, to, insert_idx,
+ last_pair);
result.SetStatus(eReturnStatusSuccessFinishNoResult);
} else {
if (from[0])
@@ -1203,9 +1203,8 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
-
- target->GetImageSearchPathList().Dump(&result.GetOutputStream());
+ Target &target = GetTarget();
+ target.GetImageSearchPathList().Dump(&result.GetOutputStream());
result.SetStatus(eReturnStatusSuccessFinishResult);
}
};
@@ -1226,7 +1225,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
if (command.GetArgumentCount() != 1) {
result.AppendError("query requires one argument\n");
return;
@@ -1234,7 +1233,7 @@ protected:
ConstString orig(command.GetArgumentAtIndex(0));
ConstString transformed;
- if (target->GetImageSearchPathList().RemapPath(orig, transformed))
+ if (target.GetImageSearchPathList().RemapPath(orig, transformed))
result.GetOutputStream().Printf("%s\n", transformed.GetCString());
else
result.GetOutputStream().Printf("%s\n", orig.GetCString());
@@ -1898,9 +1897,9 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
- uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+ uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
result.GetOutputStream().SetAddressByteSize(addr_byte_size);
result.GetErrorStream().SetAddressByteSize(addr_byte_size);
@@ -1908,7 +1907,7 @@ protected:
if (command.GetArgumentCount() == 0) {
// Dump all headers for all modules images
num_dumped = DumpModuleObjfileHeaders(result.GetOutputStream(),
- target->GetImages());
+ target.GetImages());
if (num_dumped == 0) {
result.AppendError("the target has no associated executable images");
}
@@ -1920,7 +1919,7 @@ protected:
(arg_cstr = command.GetArgumentAtIndex(arg_idx)) != nullptr;
++arg_idx) {
size_t num_matched =
- FindModulesByName(target, arg_cstr, module_list, true);
+ FindModulesByName(&target, arg_cstr, module_list, true);
if (num_matched == 0) {
result.AppendWarningWithFormat(
"Unable to find an image that matches '%s'.\n", arg_cstr);
@@ -1999,19 +1998,19 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
uint32_t num_dumped = 0;
Mangled::NamePreference name_preference =
(m_options.m_prefer_mangled ? Mangled::ePreferMangled
: Mangled::ePreferDemangled);
- uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+ uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
result.GetOutputStream().SetAddressByteSize(addr_byte_size);
result.GetErrorStream().SetAddressByteSize(addr_byte_size);
if (command.GetArgumentCount() == 0) {
// Dump all sections for all modules images
- const ModuleList &module_list = target->GetImages();
+ const ModuleList &module_list = target.GetImages();
std::lock_guard<std::recursive_mutex> guard(module_list.GetMutex());
const size_t num_modules = module_list.GetSize();
if (num_modules > 0) {
@@ -2044,7 +2043,7 @@ protected:
++arg_idx) {
ModuleList module_list;
const size_t num_matches =
- FindModulesByName(target, arg_cstr, module_list, true);
+ FindModulesByName(&target, arg_cstr, module_list, true);
if (num_matches > 0) {
for (ModuleSP module_sp : module_list.Modules()) {
if (module_sp) {
@@ -2097,16 +2096,16 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
uint32_t num_dumped = 0;
- uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+ uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
result.GetOutputStream().SetAddressByteSize(addr_byte_size);
result.GetErrorStream().SetAddressByteSize(addr_byte_size);
if (command.GetArgumentCount() == 0) {
// Dump all sections for all modules images
- const size_t num_modules = target->GetImages().GetSize();
+ const size_t num_modules = target.GetImages().GetSize();
if (num_modules == 0) {
result.AppendError("the target has no associated executable images");
return;
@@ -2123,7 +2122,7 @@ protected:
num_dumped++;
DumpModuleSections(
m_interpreter, result.GetOutputStream(),
- target->GetImages().GetModulePointerAtIndex(image_idx));
+ target.GetImages().GetModulePointerAtIndex(image_idx));
}
} else {
// Dump specified images (by basename or fullpath)
@@ -2133,7 +2132,7 @@ protected:
++arg_idx) {
ModuleList module_list;
const size_t num_matches =
- FindModulesByName(target, arg_cstr, module_list, true);
+ FindModulesByName(&target, arg_cstr, module_list, true);
if (num_matches > 0) {
for (size_t i = 0; i < num_matches; ++i) {
if (INTERRUPT_REQUESTED(GetDebugger(),
@@ -2238,9 +2237,9 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
- const ModuleList &module_list = target->GetImages();
+ const ModuleList &module_list = target.GetImages();
const size_t num_modules = module_list.GetSize();
if (num_modules == 0) {
result.AppendError("the target has no associated executable images");
@@ -2265,7 +2264,7 @@ protected:
for (const Args::ArgEntry &arg : command.entries()) {
ModuleList module_list;
const size_t num_matches =
- FindModulesByName(target, arg.c_str(), module_list, true);
+ FindModulesByName(&target, arg.c_str(), module_list, true);
if (num_matches == 0) {
// Check the global list
std::lock_guard<std::recursive_mutex> guard(
@@ -2309,16 +2308,16 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
uint32_t num_dumped = 0;
- uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+ uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
result.GetOutputStream().SetAddressByteSize(addr_byte_size);
result.GetErrorStream().SetAddressByteSize(addr_byte_size);
if (command.GetArgumentCount() == 0) {
// Dump all sections for all modules images
- const ModuleList &target_modules = target->GetImages();
+ const ModuleList &target_modules = target.GetImages();
std::lock_guard<std::recursive_mutex> guard(target_modules.GetMutex());
const size_t num_modules = target_modules.GetSize();
if (num_modules == 0) {
@@ -2344,7 +2343,7 @@ protected:
++arg_idx) {
ModuleList module_list;
const size_t num_matches =
- FindModulesByName(target, arg_cstr, module_list, true);
+ FindModulesByName(&target, arg_cstr, module_list, true);
if (num_matches > 0) {
for (size_t i = 0; i < num_matches; ++i) {
if (INTERRUPT_REQUESTED(GetDebugger(), "Interrupted dumping {0} "
@@ -2533,7 +2532,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedTarget();
+ Target &target = GetTarget();
uint32_t num_dumped = 0;
uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
@@ -2726,7 +2725,7 @@ protected:
OptionGroupFile m_symbol_file;
void DoExecute(Args &args, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
bool flush = false;
const size_t argc = args.GetArgumentCount();
@@ -2742,7 +2741,7 @@ protected:
Status error;
if (PluginManager::DownloadObjectAndSymbolFile(module_spec, error)) {
ModuleSP module_sp(
- target->GetOrCreateModule(module_spec, true /* notify */));
+ target.GetOrCreateModule(module_spec, true /* notify */));
if (module_sp) {
result.SetStatus(eReturnStatusSuccessFinishResult);
return;
@@ -2799,10 +2798,10 @@ protected:
module_spec.GetSymbolFileSpec() =
m_symbol_file.GetOptionValue().GetCurrentValue();
if (!module_spec.GetArchitecture().IsValid())
- module_spec.GetArchitecture() = target->GetArchitecture();
+ module_spec.GetArchitecture() = target.GetArchitecture();
Status error;
- ModuleSP module_sp(target->GetOrCreateModule(
- module_spec, true /* notify */, &error));
+ ModuleSP module_sp(
+ target.GetOrCreateModule(module_spec, true /* notify */, &error));
if (!module_sp) {
const char *error_cstr = error.AsCString();
if (error_cstr)
@@ -2831,7 +2830,7 @@ protected:
}
if (flush) {
- ProcessSP process = target->GetProcessSP();
+ ProcessSP process = target.GetProcessSP();
if (process)
process->Flush();
}
@@ -2876,7 +2875,7 @@ public:
protected:
void DoExecute(Args &args, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
const bool load = m_load_option.GetOptionValue().GetCurrentValue();
const bool set_pc = m_pc_option.GetOptionValue().GetCurrentValue();
@@ -2888,7 +2887,7 @@ protected:
if (load) {
if (!m_file_option.GetOptionValue().OptionWasSet() &&
!m_uuid_option_group.GetOptionValue().OptionWasSet()) {
- ModuleList &module_list = target->GetImages();
+ ModuleList &module_list = target.GetImages();
if (module_list.GetSize() == 1) {
search_using_module_spec = true;
module_spec.GetFileSpec() =
@@ -2903,7 +2902,7 @@ protected:
const bool use_global_module_list = true;
ModuleList module_list;
const size_t num_matches = FindModulesByName(
- target, arg_cstr, module_list, use_global_module_list);
+ &target, arg_cstr, module_list, use_global_module_list);
if (num_matches == 1) {
module_spec.GetFileSpec() =
module_list.GetModuleAtIndex(0)->GetFileSpec();
@@ -2926,7 +2925,7 @@ protected:
if (search_using_module_spec) {
ModuleList matching_modules;
- target->GetImages().FindModules(module_spec, matching_modules);
+ target.GetImages().FindModules(module_spec, matching_modules);
const size_t num_matches = matching_modules.GetSize();
char path[PATH_MAX];
@@ -2943,7 +2942,7 @@ protected:
const addr_t slide =
m_slide_option.GetOptionValue().GetCurrentValue();
const bool slide_is_offset = true;
- module->SetLoadAddress(*target, slide, slide_is_offset,
+ module->SetLoadAddress(target, slide, slide_is_offset,
changed);
} else {
result.AppendError("one or more section name + load "
@@ -2975,8 +2974,8 @@ protected:
sect_name);
break;
} else {
- if (target->GetSectionLoadList()
- .SetSectionLoadAddress(section_sp, load_addr))
+ if (target.GetSectionLoadList().SetSectionLoadAddress(
+ section_sp, load_addr))
changed = true;
result.AppendMessageWithFormat(
"section '%s' loaded at 0x%" PRIx64 "\n",
@@ -3007,13 +3006,13 @@ protected:
}
if (changed) {
- target->ModulesDidLoad(matching_modules);
+ target.ModulesDidLoad(matching_modules);
Process *process = m_exe_ctx.GetProcessPtr();
if (process)
process->Flush();
}
if (load) {
- ProcessSP process = target->CalculateProcess();
+ ProcessSP process = target.CalculateProcess();
Address file_entry = objfile->GetEntryPointAddress();
if (!process) {
result.AppendError("No process");
@@ -3024,7 +3023,7 @@ protected:
return;
}
std::vector<ObjectFile::LoadableData> loadables(
- objfile->GetLoadableData(*target));
+ objfile->GetLoadableData(target));
if (loadables.size() == 0) {
result.AppendError("No loadable sections");
return;
@@ -3038,7 +3037,7 @@ protected:
ThreadList &thread_list = process->GetThreadList();
RegisterContextSP reg_context(
thread_list.GetSelectedThread()->GetRegisterContext());
- addr_t file_entry_addr = file_entry.GetLoadAddress(target);
+ addr_t file_entry_addr = file_entry.GetLoadAddress(&target);
if (!reg_context->SetPC(file_entry_addr)) {
result.AppendErrorWithFormat("failed to set PC value to "
"0x%" PRIx64 "\n",
@@ -3166,50 +3165,37 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = GetDebugger().GetSelectedTarget().get();
+ Target &target = GetTarget();
const bool use_global_module_list = m_options.m_use_global_module_list;
// Define a local module list here to ensure it lives longer than any
// "locker" object which might lock its contents below (through the
// "module_list_ptr" variable).
ModuleList module_list;
- if (target == nullptr && !use_global_module_list) {
- result.AppendError("invalid target, create a debug target using the "
- "'target create' command");
- return;
- } else {
- if (target) {
- uint32_t addr_byte_size =
- target->GetArchitecture().GetAddressByteSize();
- result.GetOutputStream().SetAddressByteSize(addr_byte_size);
- result.GetErrorStream().SetAddressByteSize(addr_byte_size);
- }
- // Dump all sections for all modules images
- Stream &strm = result.GetOutputStream();
+ uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
+ result.GetOutputStream().SetAddressByteSize(addr_byte_size);
+ result.GetErrorStream().SetAddressByteSize(addr_byte_size);
+ // Dump all sections for all modules images
+ Stream &strm = result.GetOutputStream();
- if (m_options.m_module_addr != LLDB_INVALID_ADDRESS) {
- if (target) {
- Address module_address;
- if (module_address.SetLoadAddress(m_options.m_module_addr, target)) {
- ModuleSP module_sp(module_address.GetModule());
- if (module_sp) {
- PrintModule(target, module_sp.get(), 0, strm);
- result.SetStatus(eReturnStatusSuccessFinishResult);
- } else {
- result.AppendErrorWithFormat(
- "Couldn't find module matching address: 0x%" PRIx64 ".",
- m_options.m_module_addr);
- }
- } else {
- result.AppendErrorWithFormat(
- "Couldn't find module containing address: 0x%" PRIx64 ".",
- m_options.m_module_addr);
- }
+ if (m_options.m_module_addr != LLDB_INVALID_ADDRESS) {
+ Address module_address;
+ if (module_address.SetLoadAddress(m_options.m_module_addr, &target)) {
+ ModuleSP module_sp(module_address.GetModule());
+ if (module_sp) {
+ PrintModule(target, module_sp.get(), 0, strm);
+ result.SetStatus(eReturnStatusSuccessFinishResult);
} else {
- result.AppendError(
- "Can only look up modules by address with a valid target.");
+ result.AppendErrorWithFormat(
+ "Couldn't find module matching address: 0x%" PRIx64 ".",
+ m_options.m_module_addr);
}
- return;
+ } else {
+ result.AppendErrorWithFormat(
+ "Couldn't find module containing address: 0x%" PRIx64 ".",
+ m_options.m_module_addr);
}
+ return;
+ }
size_t num_modules = 0;
@@ -3227,13 +3213,13 @@ protected:
guard.lock();
num_modules = Module::GetNumberAllocatedModules();
} else {
- module_list_ptr = &target->GetImages();
+ module_list_ptr = &target.GetImages();
}
} else {
for (const Args::ArgEntry &arg : command) {
// Dump specified images (by basename or fullpath)
const size_t num_matches = FindModulesByName(
- target, arg.c_str(), module_list, use_global_module_list);
+ &target, arg.c_str(), module_list, use_global_module_list);
if (num_matches == 0) {
if (argc == 1) {
result.AppendErrorWithFormat("no modules found that match '%s'",
@@ -3286,10 +3272,9 @@ protected:
}
return;
}
- }
}
- void PrintModule(Target *target, Module *module, int indent, Stream &strm) {
+ void PrintModule(Target &target, Module *module, int indent, Stream &strm) {
if (module == nullptr) {
strm.PutCString("Null module");
return;
@@ -3338,17 +3323,16 @@ protected:
// Image header address
{
uint32_t addr_nibble_width =
- target ? (target->GetArchitecture().GetAddressByteSize() * 2)
- : 16;
+ target.GetArchitecture().GetAddressByteSize() * 2;
ObjectFile *objfile = module->GetObjectFile();
if (objfile) {
Address base_addr(objfile->GetBaseAddress());
if (base_addr.IsValid()) {
- if (target && !target->GetSectionLoadList().IsEmpty()) {
- lldb::addr_t load_addr = base_addr.GetLoadAddress(target);
+ if (!target.GetSectionLoadList().IsEmpty()) {
+ lldb::addr_t load_addr = base_addr.GetLoadAddress(&target);
if (load_addr == LLDB_INVALID_ADDRESS) {
- base_addr.Dump(&strm, target,
+ base_addr.Dump(&strm, &target,
Address::DumpStyleModuleWithFileAddress,
Address::DumpStyleFileAddress);
} else {
@@ -3367,7 +3351,7 @@ protected:
}
// The address was valid, but the image isn't loaded, output the
// address in an appropriate format
- base_addr.Dump(&strm, target, Address::DumpStyleFileAddress);
+ base_addr.Dump(&strm, &target, Address::DumpStyleFileAddress);
break;
}
}
@@ -3969,7 +3953,7 @@ public:
return false;
case eLookupTypeType:
if (!m_options.m_str.empty()) {
- if (LookupTypeHere(&GetSelectedTarget(), m_interpreter,
+ if (LookupTypeHere(&GetTarget(), m_interpreter,
result.GetOutputStream(), *sym_ctx.module_sp,
m_options.m_str.c_str(), m_options.m_use_regex)) {
result.SetStatus(eReturnStatusSuccessFinishResult);
@@ -4048,8 +4032,8 @@ public:
case eLookupTypeType:
if (!m_options.m_str.empty()) {
if (LookupTypeInModule(
- &GetSelectedTarget(), m_interpreter, result.GetOutputStream(),
- module, m_options.m_str.c_str(), m_options.m_use_regex)) {
+ &GetTarget(), m_interpreter, result.GetOutputStream(), module,
+ m_options.m_str.c_str(), m_options.m_use_regex)) {
result.SetStatus(eReturnStatusSuccessFinishResult);
return true;
}
@@ -4070,11 +4054,11 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
bool syntax_error = false;
uint32_t i;
uint32_t num_successful_lookups = 0;
- uint32_t addr_byte_size = target->GetArchitecture().GetAddressByteSize();
+ uint32_t addr_byte_size = target.GetArchitecture().GetAddressByteSize();
result.GetOutputStream().SetAddressByteSize(addr_byte_size);
result.GetErrorStream().SetAddressByteSize(addr_byte_size);
// Dump all sections for all modules images
@@ -4096,7 +4080,7 @@ protected:
// Dump all sections for all other modules
- const ModuleList &target_modules = target->GetImages();
+ const ModuleList &target_modules = target.GetImages();
std::lock_guard<std::recursive_mutex> guard(target_modules.GetMutex());
if (target_modules.GetSize() == 0) {
result.AppendError("the target has no associated executable images");
@@ -4119,7 +4103,7 @@ protected:
++i) {
ModuleList module_list;
const size_t num_matches =
- FindModulesByName(target, arg_cstr, module_list, false);
+ FindModulesByName(&target, arg_cstr, module_list, false);
if (num_matches > 0) {
for (size_t j = 0; j < num_matches; ++j) {
Module *module = module_list.GetModulePointerAtIndex(j);
@@ -4937,10 +4921,7 @@ protected:
m_stop_hook_sp->GetID());
error_sp->Flush();
}
- Target *target = GetDebugger().GetSelectedTarget().get();
- if (target) {
- target->UndoCreateStopHook(m_stop_hook_sp->GetID());
- }
+ GetTarget().UndoCreateStopHook(m_stop_hook_sp->GetID());
} else {
// The IOHandler editor is only for command lines stop hooks:
Target::StopHookCommandLine *hook_ptr =
@@ -4962,7 +4943,7 @@ protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
m_stop_hook_sp.reset();
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
Target::StopHookSP new_hook_sp =
target.CreateStopHook(m_python_class_options.GetName().empty() ?
Target::StopHook::StopHookKind::CommandBased
@@ -5099,7 +5080,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
// FIXME: see if we can use the breakpoint id style parser?
size_t num_args = command.GetArgumentCount();
if (num_args == 0) {
@@ -5153,7 +5134,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
// FIXME: see if we can use the breakpoint id style parser?
size_t num_args = command.GetArgumentCount();
bool success;
@@ -5197,7 +5178,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
size_t num_hooks = target.GetNumStopHooks();
if (num_hooks == 0) {
@@ -5263,7 +5244,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
// Go over every scratch TypeSystem and dump to the command output.
- for (lldb::TypeSystemSP ts : GetSelectedTarget().GetScratchTypeSystems())
+ for (lldb::TypeSystemSP ts : GetTarget().GetScratchTypeSystems())
if (ts)
ts->Dump(result.GetOutputStream().AsRawOstream());
@@ -5287,7 +5268,7 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target &target = GetSelectedTarget();
+ Target &target = GetTarget();
target.GetSectionLoadList().Dump(result.GetOutputStream(), &target);
result.SetStatus(eReturnStatusSuccessFinishResult);
}
diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp
index 4398cf3..366b6dd 100644
--- a/lldb/source/Commands/CommandObjectThread.cpp
+++ b/lldb/source/Commands/CommandObjectThread.cpp
@@ -882,7 +882,7 @@ protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
bool synchronous_execution = m_interpreter.GetSynchronous();
- Target *target = &GetSelectedTarget();
+ Target *target = &GetTarget();
Process *process = m_exe_ctx.GetProcessPtr();
if (process == nullptr) {
diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp
index f123211..126982d 100644
--- a/lldb/source/Commands/CommandObjectWatchpoint.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp
@@ -39,10 +39,10 @@ static void AddWatchpointDescription(Stream &s, Watchpoint &wp,
s.EOL();
}
-static bool CheckTargetForWatchpointOperations(Target *target,
+static bool CheckTargetForWatchpointOperations(Target &target,
CommandReturnObject &result) {
bool process_is_valid =
- target->GetProcessSP() && target->GetProcessSP()->IsAlive();
+ target.GetProcessSP() && target.GetProcessSP()->IsAlive();
if (!process_is_valid) {
result.AppendError("There's no process or it is not alive.");
return false;
@@ -67,12 +67,10 @@ static int32_t WithRSAIndex(llvm::StringRef Arg) {
// Return true if wp_ids is successfully populated with the watch ids. False
// otherwise.
bool CommandObjectMultiwordWatchpoint::VerifyWatchpointIDs(
- Target *target, Args &args, std::vector<uint32_t> &wp_ids) {
+ Target &target, Args &args, std::vector<uint32_t> &wp_ids) {
// Pre-condition: args.GetArgumentCount() > 0.
if (args.GetArgumentCount() == 0) {
- if (target == nullptr)
- return false;
- WatchpointSP watch_sp = target->GetLastCreatedWatchpoint();
+ WatchpointSP watch_sp = target.GetLastCreatedWatchpoint();
if (watch_sp) {
wp_ids.push_back(watch_sp->GetID());
return true;
@@ -203,22 +201,24 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
- if (target->GetProcessSP() && target->GetProcessSP()->IsAlive()) {
- std::optional<uint32_t> num_supported_hardware_watchpoints =
- target->GetProcessSP()->GetWatchpointSlotCount();
+ if (ProcessSP process_sp = target.GetProcessSP()) {
+ if (process_sp->IsAlive()) {
+ std::optional<uint32_t> num_supported_hardware_watchpoints =
+ process_sp->GetWatchpointSlotCount();
- if (num_supported_hardware_watchpoints)
- result.AppendMessageWithFormat(
- "Number of supported hardware watchpoints: %u\n",
- *num_supported_hardware_watchpoints);
+ if (num_supported_hardware_watchpoints)
+ result.AppendMessageWithFormat(
+ "Number of supported hardware watchpoints: %u\n",
+ *num_supported_hardware_watchpoints);
+ }
}
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
std::unique_lock<std::recursive_mutex> lock;
- target->GetWatchpointList().GetListMutex(lock);
+ target.GetWatchpointList().GetListMutex(lock);
size_t num_watchpoints = watchpoints.GetSize();
@@ -286,14 +286,14 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
if (!CheckTargetForWatchpointOperations(target, result))
return;
std::unique_lock<std::recursive_mutex> lock;
- target->GetWatchpointList().GetListMutex(lock);
+ target.GetWatchpointList().GetListMutex(lock);
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
@@ -304,7 +304,7 @@ protected:
if (command.GetArgumentCount() == 0) {
// No watchpoint selected; enable all currently set watchpoints.
- target->EnableAllWatchpoints();
+ target.EnableAllWatchpoints();
result.AppendMessageWithFormat("All watchpoints enabled. (%" PRIu64
" watchpoints)\n",
(uint64_t)num_watchpoints);
@@ -321,7 +321,7 @@ protected:
int count = 0;
const size_t size = wp_ids.size();
for (size_t i = 0; i < size; ++i)
- if (target->EnableWatchpointByID(wp_ids[i]))
+ if (target.EnableWatchpointByID(wp_ids[i]))
++count;
result.AppendMessageWithFormat("%d watchpoints enabled.\n", count);
result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -355,14 +355,14 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
if (!CheckTargetForWatchpointOperations(target, result))
return;
std::unique_lock<std::recursive_mutex> lock;
- target->GetWatchpointList().GetListMutex(lock);
+ target.GetWatchpointList().GetListMutex(lock);
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
if (num_watchpoints == 0) {
@@ -372,7 +372,7 @@ protected:
if (command.GetArgumentCount() == 0) {
// No watchpoint selected; disable all currently set watchpoints.
- if (target->DisableAllWatchpoints()) {
+ if (target.DisableAllWatchpoints()) {
result.AppendMessageWithFormat("All watchpoints disabled. (%" PRIu64
" watchpoints)\n",
(uint64_t)num_watchpoints);
@@ -392,7 +392,7 @@ protected:
int count = 0;
const size_t size = wp_ids.size();
for (size_t i = 0; i < size; ++i)
- if (target->DisableWatchpointByID(wp_ids[i]))
+ if (target.DisableWatchpointByID(wp_ids[i]))
++count;
result.AppendMessageWithFormat("%d watchpoints disabled.\n", count);
result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -464,14 +464,14 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
if (!CheckTargetForWatchpointOperations(target, result))
return;
std::unique_lock<std::recursive_mutex> lock;
- target->GetWatchpointList().GetListMutex(lock);
+ target.GetWatchpointList().GetListMutex(lock);
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
@@ -487,7 +487,7 @@ protected:
true)) {
result.AppendMessage("Operation cancelled...");
} else {
- target->RemoveAllWatchpoints();
+ target.RemoveAllWatchpoints();
result.AppendMessageWithFormat("All watchpoints removed. (%" PRIu64
" watchpoints)\n",
(uint64_t)num_watchpoints);
@@ -507,7 +507,7 @@ protected:
int count = 0;
const size_t size = wp_ids.size();
for (size_t i = 0; i < size; ++i)
- if (target->RemoveWatchpointByID(wp_ids[i]))
+ if (target.RemoveWatchpointByID(wp_ids[i]))
++count;
result.AppendMessageWithFormat("%d watchpoints deleted.\n", count);
result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -584,14 +584,14 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
if (!CheckTargetForWatchpointOperations(target, result))
return;
std::unique_lock<std::recursive_mutex> lock;
- target->GetWatchpointList().GetListMutex(lock);
+ target.GetWatchpointList().GetListMutex(lock);
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
@@ -601,7 +601,7 @@ protected:
}
if (command.GetArgumentCount() == 0) {
- target->IgnoreAllWatchpoints(m_options.m_ignore_count);
+ target.IgnoreAllWatchpoints(m_options.m_ignore_count);
result.AppendMessageWithFormat("All watchpoints ignored. (%" PRIu64
" watchpoints)\n",
(uint64_t)num_watchpoints);
@@ -618,7 +618,7 @@ protected:
int count = 0;
const size_t size = wp_ids.size();
for (size_t i = 0; i < size; ++i)
- if (target->IgnoreWatchpointByID(wp_ids[i], m_options.m_ignore_count))
+ if (target.IgnoreWatchpointByID(wp_ids[i], m_options.m_ignore_count))
++count;
result.AppendMessageWithFormat("%d watchpoints ignored.\n", count);
result.SetStatus(eReturnStatusSuccessFinishNoResult);
@@ -703,14 +703,14 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
if (!CheckTargetForWatchpointOperations(target, result))
return;
std::unique_lock<std::recursive_mutex> lock;
- target->GetWatchpointList().GetListMutex(lock);
+ target.GetWatchpointList().GetListMutex(lock);
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
@@ -720,7 +720,7 @@ protected:
}
if (command.GetArgumentCount() == 0) {
- WatchpointSP watch_sp = target->GetLastCreatedWatchpoint();
+ WatchpointSP watch_sp = target.GetLastCreatedWatchpoint();
watch_sp->SetCondition(m_options.m_condition.c_str());
result.SetStatus(eReturnStatusSuccessFinishNoResult);
} else {
@@ -804,7 +804,7 @@ protected:
}
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = GetDebugger().GetSelectedTarget().get();
+ Target &target = GetTarget();
StackFrame *frame = m_exe_ctx.GetFramePtr();
// If no argument is present, issue an error message. There's no way to
@@ -852,8 +852,8 @@ protected:
Status error(Variable::GetValuesForVariableExpressionPath(
command.GetArgumentAtIndex(0),
- m_exe_ctx.GetBestExecutionContextScope(), GetVariableCallback, target,
- variable_list, valobj_list));
+ m_exe_ctx.GetBestExecutionContextScope(), GetVariableCallback,
+ &target, variable_list, valobj_list));
if (valobj_list.GetSize())
valobj_sp = valobj_list.GetValueObjectAtIndex(0);
@@ -904,7 +904,7 @@ protected:
error.Clear();
WatchpointSP watch_sp =
- target->CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
+ target.CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
if (!watch_sp) {
result.AppendErrorWithFormat(
"Watchpoint creation failed (addr=0x%" PRIx64 ", size=%" PRIu64
@@ -991,7 +991,7 @@ protected:
m_option_group.NotifyOptionParsingStarting(
&exe_ctx); // This is a raw command, so notify the option group
- Target *target = GetDebugger().GetSelectedTarget().get();
+ Target &target = GetTarget();
StackFrame *frame = m_exe_ctx.GetFramePtr();
OptionsWithRaw args(raw_command);
@@ -1034,7 +1034,7 @@ protected:
options.SetLanguage(m_option_watchpoint.language_type);
ExpressionResults expr_result =
- target->EvaluateExpression(expr, frame, valobj_sp, options);
+ target.EvaluateExpression(expr, frame, valobj_sp, options);
if (expr_result != eExpressionCompleted) {
result.AppendError("expression evaluation of address to watch failed");
result.AppendErrorWithFormat("expression evaluated: \n%s", expr.data());
@@ -1054,7 +1054,7 @@ protected:
if (m_option_watchpoint.watch_size.GetCurrentValue() != 0)
size = m_option_watchpoint.watch_size.GetCurrentValue();
else
- size = target->GetArchitecture().GetAddressByteSize();
+ size = target.GetArchitecture().GetAddressByteSize();
// Now it's time to create the watchpoint.
uint32_t watch_type;
@@ -1095,7 +1095,7 @@ protected:
Status error;
WatchpointSP watch_sp =
- target->CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
+ target.CreateWatchpoint(addr, size, &compiler_type, watch_type, error);
if (watch_sp) {
watch_sp->SetWatchSpec(std::string(expr));
Stream &output_stream = result.GetOutputStream();
diff --git a/lldb/source/Commands/CommandObjectWatchpoint.h b/lldb/source/Commands/CommandObjectWatchpoint.h
index 87f9f43..a684911 100644
--- a/lldb/source/Commands/CommandObjectWatchpoint.h
+++ b/lldb/source/Commands/CommandObjectWatchpoint.h
@@ -22,7 +22,7 @@ public:
~CommandObjectMultiwordWatchpoint() override;
- static bool VerifyWatchpointIDs(Target *target, Args &args,
+ static bool VerifyWatchpointIDs(Target &target, Args &args,
std::vector<uint32_t> &wp_ids);
};
diff --git a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
index aaf1454..cc4cb76 100644
--- a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
@@ -355,9 +355,9 @@ are no syntax errors may indicate that a function was declared but never called.
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
if (num_watchpoints == 0) {
@@ -384,7 +384,7 @@ protected:
for (size_t i = 0; i < count; ++i) {
uint32_t cur_wp_id = valid_wp_ids.at(i);
if (cur_wp_id != LLDB_INVALID_WATCH_ID) {
- Watchpoint *wp = target->GetWatchpointList().FindByID(cur_wp_id).get();
+ Watchpoint *wp = target.GetWatchpointList().FindByID(cur_wp_id).get();
// Sanity check wp first.
if (wp == nullptr)
continue;
@@ -450,9 +450,9 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
if (num_watchpoints == 0) {
@@ -478,7 +478,7 @@ protected:
for (size_t i = 0; i < count; ++i) {
uint32_t cur_wp_id = valid_wp_ids.at(i);
if (cur_wp_id != LLDB_INVALID_WATCH_ID) {
- Watchpoint *wp = target->GetWatchpointList().FindByID(cur_wp_id).get();
+ Watchpoint *wp = target.GetWatchpointList().FindByID(cur_wp_id).get();
if (wp)
wp->ClearCallback();
} else {
@@ -505,9 +505,9 @@ public:
protected:
void DoExecute(Args &command, CommandReturnObject &result) override {
- Target *target = &GetSelectedTarget();
+ Target &target = GetTarget();
- const WatchpointList &watchpoints = target->GetWatchpointList();
+ const WatchpointList &watchpoints = target.GetWatchpointList();
size_t num_watchpoints = watchpoints.GetSize();
if (num_watchpoints == 0) {
@@ -533,7 +533,7 @@ protected:
for (size_t i = 0; i < count; ++i) {
uint32_t cur_wp_id = valid_wp_ids.at(i);
if (cur_wp_id != LLDB_INVALID_WATCH_ID) {
- Watchpoint *wp = target->GetWatchpointList().FindByID(cur_wp_id).get();
+ Watchpoint *wp = target.GetWatchpointList().FindByID(cur_wp_id).get();
if (wp) {
const WatchpointOptions *wp_options = wp->GetOptions();
diff --git a/lldb/source/Commands/Options.td b/lldb/source/Commands/Options.td
index 24e97f3..6e5ed21 100644
--- a/lldb/source/Commands/Options.td
+++ b/lldb/source/Commands/Options.td
@@ -841,6 +841,12 @@ let Command = "scripting run" in {
" language. If none is specific the default scripting language is used.">;
}
+let Command = "scripting template list" in {
+ def scripting_template_list_language : Option<"language", "l">,
+ EnumArg<"ScriptLang">, Desc<"Specify the scripting "
+ " language. If none is specified the default scripting language is used.">;
+}
+
let Command = "source info" in {
def source_info_count : Option<"count", "c">, Arg<"Count">,
Desc<"The number of line entries to display.">;
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index 759ef3a..01bee86 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -1505,6 +1505,70 @@ LanguageSet PluginManager::GetAllTypeSystemSupportedLanguagesForExpressions() {
return all;
}
+#pragma mark ScriptedInterfaces
+
+struct ScriptedInterfaceInstance
+ : public PluginInstance<ScriptedInterfaceCreateInstance> {
+ ScriptedInterfaceInstance(llvm::StringRef name, llvm::StringRef description,
+ ScriptedInterfaceCreateInstance create_callback,
+ lldb::ScriptLanguage language,
+ ScriptedInterfaceUsages usages)
+ : PluginInstance<ScriptedInterfaceCreateInstance>(name, description,
+ create_callback),
+ language(language), usages(usages) {}
+
+ lldb::ScriptLanguage language;
+ ScriptedInterfaceUsages usages;
+};
+
+typedef PluginInstances<ScriptedInterfaceInstance> ScriptedInterfaceInstances;
+
+static ScriptedInterfaceInstances &GetScriptedInterfaceInstances() {
+ static ScriptedInterfaceInstances g_instances;
+ return g_instances;
+}
+
+bool PluginManager::RegisterPlugin(
+ llvm::StringRef name, llvm::StringRef description,
+ ScriptedInterfaceCreateInstance create_callback,
+ lldb::ScriptLanguage language, ScriptedInterfaceUsages usages) {
+ return GetScriptedInterfaceInstances().RegisterPlugin(
+ name, description, create_callback, language, usages);
+}
+
+bool PluginManager::UnregisterPlugin(
+ ScriptedInterfaceCreateInstance create_callback) {
+ return GetScriptedInterfaceInstances().UnregisterPlugin(create_callback);
+}
+
+uint32_t PluginManager::GetNumScriptedInterfaces() {
+ return GetScriptedInterfaceInstances().GetInstances().size();
+}
+
+llvm::StringRef PluginManager::GetScriptedInterfaceNameAtIndex(uint32_t index) {
+ return GetScriptedInterfaceInstances().GetNameAtIndex(index);
+}
+
+llvm::StringRef
+PluginManager::GetScriptedInterfaceDescriptionAtIndex(uint32_t index) {
+ return GetScriptedInterfaceInstances().GetDescriptionAtIndex(index);
+}
+
+lldb::ScriptLanguage
+PluginManager::GetScriptedInterfaceLanguageAtIndex(uint32_t idx) {
+ const auto &instances = GetScriptedInterfaceInstances().GetInstances();
+ return idx < instances.size() ? instances[idx].language
+ : ScriptLanguage::eScriptLanguageNone;
+}
+
+ScriptedInterfaceUsages
+PluginManager::GetScriptedInterfaceUsagesAtIndex(uint32_t idx) {
+ const auto &instances = GetScriptedInterfaceInstances().GetInstances();
+ if (idx >= instances.size())
+ return {};
+ return instances[idx].usages;
+}
+
#pragma mark REPL
struct REPLInstance : public PluginInstance<REPLCreateInstance> {
@@ -1565,6 +1629,7 @@ void PluginManager::DebuggerInitialize(Debugger &debugger) {
GetOperatingSystemInstances().PerformDebuggerCallback(debugger);
GetStructuredDataPluginInstances().PerformDebuggerCallback(debugger);
GetTracePluginInstances().PerformDebuggerCallback(debugger);
+ GetScriptedInterfaceInstances().PerformDebuggerCallback(debugger);
}
// This is the preferred new way to register plugin specific settings. e.g.
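Editor's note: the PluginManager hunk above introduces a registration surface for scripted interfaces. Below is a minimal sketch (not part of the patch) of how a caller could walk that registry, using only the accessors added in this change; the helper name and the lldb/Utility/Stream.h include path are illustrative assumptions.

#include "lldb/Core/PluginManager.h"
#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
#include "lldb/Utility/Stream.h"

using namespace lldb_private;

// Hypothetical helper: list every registered scripted interface with its
// description and command-interpreter usage strings.
static void DumpScriptedInterfaces(Stream &s) {
  for (uint32_t i = 0; i < PluginManager::GetNumScriptedInterfaces(); ++i) {
    s << PluginManager::GetScriptedInterfaceNameAtIndex(i) << ": "
      << PluginManager::GetScriptedInterfaceDescriptionAtIndex(i) << '\n';
    // GetScriptedInterfaceUsagesAtIndex returns an empty set for an
    // out-of-range index.
    ScriptedInterfaceUsages usages =
        PluginManager::GetScriptedInterfaceUsagesAtIndex(i);
    usages.Dump(s, ScriptedInterfaceUsages::UsageKind::CommandInterpreter);
  }
}

Each entry also records its lldb::ScriptLanguage via GetScriptedInterfaceLanguageAtIndex, which callers can use to filter by scripting language.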
diff --git a/lldb/source/Interpreter/CMakeLists.txt b/lldb/source/Interpreter/CMakeLists.txt
index ae79b82..642263a 100644
--- a/lldb/source/Interpreter/CMakeLists.txt
+++ b/lldb/source/Interpreter/CMakeLists.txt
@@ -6,6 +6,8 @@ lldb_tablegen(InterpreterPropertiesEnum.inc -gen-lldb-property-enum-defs
SOURCE InterpreterProperties.td
TARGET LLDBInterpreterPropertiesEnumGen)
+add_subdirectory(Interfaces)
+
add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
CommandAlias.cpp
CommandHistory.cpp
@@ -54,6 +56,7 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
ScriptInterpreter.cpp
LINK_LIBS
+ lldbInterpreterInterfaces
lldbCommands
lldbCore
lldbDataFormatters
@@ -66,6 +69,7 @@ add_lldb_library(lldbInterpreter NO_PLUGIN_DEPENDENCIES
)
add_dependencies(lldbInterpreter
+ lldbInterpreterInterfaces
LLDBInterpreterPropertiesGen
LLDBInterpreterPropertiesEnumGen)
diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp
index 4634b75..c819024 100644
--- a/lldb/source/Interpreter/CommandObject.cpp
+++ b/lldb/source/Interpreter/CommandObject.cpp
@@ -758,17 +758,23 @@ Target &CommandObject::GetDummyTarget() {
return m_interpreter.GetDebugger().GetDummyTarget();
}
-Target &CommandObject::GetSelectedOrDummyTarget(bool prefer_dummy) {
- return m_interpreter.GetDebugger().GetSelectedOrDummyTarget(prefer_dummy);
-}
-
-Target &CommandObject::GetSelectedTarget() {
- assert(m_flags.AnySet(eCommandRequiresTarget | eCommandProcessMustBePaused |
- eCommandProcessMustBeLaunched | eCommandRequiresFrame |
- eCommandRequiresThread | eCommandRequiresProcess |
- eCommandRequiresRegContext) &&
- "GetSelectedTarget called from object that may have no target");
- return *m_interpreter.GetDebugger().GetSelectedTarget();
+Target &CommandObject::GetTarget() {
+ // Prefer the frozen execution context in the command object.
+ if (Target *target = m_exe_ctx.GetTargetPtr())
+ return *target;
+
+ // Fallback to the command interpreter's execution context in case we get
+ // called after DoExecute has finished. For example, when doing multi-line
+ // expression that uses an input reader or breakpoint callbacks.
+ if (Target *target = m_interpreter.GetExecutionContext().GetTargetPtr())
+ return *target;
+
+ // Finally, if we have no other target, get the selected target.
+ if (TargetSP target_sp = m_interpreter.GetDebugger().GetSelectedTarget())
+ return *target_sp;
+
+ // We only have the dummy target.
+ return GetDummyTarget();
}
Thread *CommandObject::GetDefaultThread() {
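Editor's note: the new CommandObject::GetTarget() above always yields a usable Target&, which is why the command objects throughout this patch drop their Target* null checks. A minimal sketch of the resulting shape of a command subclass follows; the class name, command name, and output are illustrative assumptions, not taken from the patch.

#include "lldb/Interpreter/CommandObject.h"
#include "lldb/Interpreter/CommandReturnObject.h"
#include "lldb/Target/Target.h"

using namespace lldb;
using namespace lldb_private;

class CommandObjectExampleInfo : public CommandObjectParsed {
public:
  CommandObjectExampleInfo(CommandInterpreter &interpreter)
      : CommandObjectParsed(interpreter, "example info",
                            "Illustrative command; not part of this patch.") {}

protected:
  void DoExecute(Args &command, CommandReturnObject &result) override {
    // GetTarget() prefers the command's frozen m_exe_ctx, then the
    // interpreter's execution context, then the selected target, and finally
    // the dummy target, so no null check is needed here.
    Target &target = GetTarget();
    result.GetOutputStream().SetAddressByteSize(
        target.GetArchitecture().GetAddressByteSize());
    result.SetStatus(eReturnStatusSuccessFinishResult);
  }
};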
diff --git a/lldb/source/Interpreter/Interfaces/CMakeLists.txt b/lldb/source/Interpreter/Interfaces/CMakeLists.txt
new file mode 100644
index 0000000..f44672a
--- /dev/null
+++ b/lldb/source/Interpreter/Interfaces/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_lldb_library(lldbInterpreterInterfaces NO_PLUGIN_DEPENDENCIES
+ ScriptedInterfaceUsages.cpp
+
+ LINK_LIBS
+ lldbUtility
+
+ LINK_COMPONENTS
+ Support
+ )
+
diff --git a/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp b/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
new file mode 100644
index 0000000..05d7a5d
--- /dev/null
+++ b/lldb/source/Interpreter/Interfaces/ScriptedInterfaceUsages.cpp
@@ -0,0 +1,37 @@
+//===-- ScriptedInterfaceUsages.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Interpreter/Interfaces/ScriptedInterfaceUsages.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+void ScriptedInterfaceUsages::Dump(Stream &s, UsageKind kind) const {
+ s.IndentMore();
+ s.Indent();
+ llvm::StringRef usage_kind =
+ (kind == UsageKind::CommandInterpreter) ? "Command Interpreter" : "API";
+ s << usage_kind << " Usages:";
+ const std::vector<llvm::StringRef> &usages =
+ (kind == UsageKind::CommandInterpreter) ? GetCommandInterpreterUsages()
+ : GetSBAPIUsages();
+ if (usages.empty())
+ s << " None\n";
+ else if (usages.size() == 1)
+ s << " " << usages.front() << '\n';
+ else {
+ s << '\n';
+ for (llvm::StringRef usage : usages) {
+ s.IndentMore();
+ s.Indent();
+ s << usage << '\n';
+ s.IndentLess();
+ }
+ }
+ s.IndentLess();
+}
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
index 187370e..5d0a3e3 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunication.cpp
@@ -1145,8 +1145,8 @@ Status GDBRemoteCommunication::StartDebugserverProcess(
if (socket_pipe.CanWrite())
socket_pipe.CloseWriteFileDescriptor();
if (socket_pipe.CanRead()) {
- char port_cstr[PATH_MAX] = {0};
- port_cstr[0] = '\0';
+ // The port number may be up to "65535\0".
+ char port_cstr[6] = {0};
size_t num_bytes = sizeof(port_cstr);
// Read port from pipe with 10 second timeout.
error = socket_pipe.ReadWithTimeout(
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
index c60e4bb..8c7e92b 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/CMakeLists.txt
@@ -20,12 +20,9 @@ if (LLDB_ENABLE_LIBEDIT)
endif()
add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
- OperatingSystemPythonInterface.cpp
ScriptedPythonInterface.cpp
ScriptedProcessPythonInterface.cpp
ScriptedThreadPythonInterface.cpp
- ScriptedThreadPlanPythonInterface.cpp
- ScriptedPlatformPythonInterface.cpp
LINK_LIBS
lldbCore
@@ -38,3 +35,8 @@ add_lldb_library(lldbPluginScriptInterpreterPythonInterfaces
LINK_COMPONENTS
Support
)
+
+add_subdirectory(OperatingSystemPythonInterface)
+add_subdirectory(ScriptedPlatformPythonInterface)
+add_subdirectory(ScriptedThreadPlanPythonInterface)
+
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000..b48f1e8
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonOperatingSystemPythonInterface PLUGIN
+
+ OperatingSystemPythonInterface.cpp
+
+ LINK_LIBS
+ lldbCore
+ lldbHost
+ lldbInterpreter
+ lldbTarget
+ lldbPluginScriptInterpreterPython
+ ${Python3_LIBRARIES}
+ ${LLDB_LIBEDIT_LIBS}
+
+ LINK_COMPONENTS
+ Support
+ )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
index c162c73..019db26 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "lldb/Core/PluginManager.h"
#include "lldb/Host/Config.h"
#include "lldb/Target/ExecutionContext.h"
#include "lldb/Utility/Log.h"
@@ -13,11 +14,13 @@
#if LLDB_ENABLE_PYTHON
+// clang-format off
// LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+// clang-format on
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
#include "OperatingSystemPythonInterface.h"
using namespace lldb;
@@ -25,6 +28,8 @@ using namespace lldb_private;
using namespace lldb_private::python;
using Locker = ScriptInterpreterPythonImpl::Locker;
+LLDB_PLUGIN_DEFINE_ADV(OperatingSystemPythonInterface, ScriptInterpreterPythonOperatingSystemPythonInterface)
+
OperatingSystemPythonInterface::OperatingSystemPythonInterface(
ScriptInterpreterPythonImpl &interpreter)
: OperatingSystemInterface(), ScriptedThreadPythonInterface(interpreter) {}
@@ -79,4 +84,18 @@ OperatingSystemPythonInterface::GetRegisterContextForTID(lldb::tid_t tid) {
return obj->GetAsString()->GetValue().str();
}
+void OperatingSystemPythonInterface::Initialize() {
+ const std::vector<llvm::StringRef> ci_usages = {
+ "settings set target.process.python-os-plugin-path <script-path>",
+ "settings set process.experimental.os-plugin-reports-all-threads [0/1]"};
+ const std::vector<llvm::StringRef> api_usages = {};
+ PluginManager::RegisterPlugin(
+ GetPluginNameStatic(), llvm::StringRef("Mock thread state"),
+ CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void OperatingSystemPythonInterface::Terminate() {
+ PluginManager::UnregisterPlugin(CreateInstance);
+}
+
#endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
index da7bbf1..6d60f8b 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h
@@ -10,17 +10,19 @@
#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_OPERATINGSYSTEMPYTHONINTERFACE_H
#include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
#if LLDB_ENABLE_PYTHON
-#include "ScriptedThreadPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/OperatingSystemInterface.h"
+#include "../ScriptedThreadPythonInterface.h"
+
#include <optional>
namespace lldb_private {
class OperatingSystemPythonInterface
: virtual public OperatingSystemInterface,
- virtual public ScriptedThreadPythonInterface {
+ virtual public ScriptedThreadPythonInterface,
+ public PluginInterface {
public:
OperatingSystemPythonInterface(ScriptInterpreterPythonImpl &interpreter);
@@ -41,6 +43,16 @@ public:
StructuredData::DictionarySP GetRegisterInfo() override;
std::optional<std::string> GetRegisterContextForTID(lldb::tid_t tid) override;
+
+ static void Initialize();
+
+ static void Terminate();
+
+ static llvm::StringRef GetPluginNameStatic() {
+ return "OperatingSystemPythonInterface";
+ }
+
+ llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
};
} // namespace lldb_private
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000..ae5e525
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonScriptedPlatformPythonInterface PLUGIN
+
+ ScriptedPlatformPythonInterface.cpp
+
+ LINK_LIBS
+ lldbCore
+ lldbHost
+ lldbInterpreter
+ lldbTarget
+ lldbPluginScriptInterpreterPython
+ ${Python3_LIBRARIES}
+ ${LLDB_LIBEDIT_LIBS}
+
+ LINK_COMPONENTS
+ Support
+ )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
index 6e93bec..3586251 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.cpp
@@ -6,27 +6,31 @@
//
//===----------------------------------------------------------------------===//
+#include "lldb/Core/PluginManager.h"
#include "lldb/Host/Config.h"
+#include "lldb/Target/ExecutionContext.h"
#include "lldb/Utility/Log.h"
#include "lldb/Utility/Status.h"
#include "lldb/lldb-enumerations.h"
#if LLDB_ENABLE_PYTHON
+// clang-format off
// LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+// clang-format on
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
#include "ScriptedPlatformPythonInterface.h"
-#include "lldb/Target/ExecutionContext.h"
-
using namespace lldb;
using namespace lldb_private;
using namespace lldb_private::python;
using Locker = ScriptInterpreterPythonImpl::Locker;
+LLDB_PLUGIN_DEFINE_ADV(ScriptedPlatformPythonInterface, ScriptInterpreterPythonScriptedPlatformPythonInterface)
+
ScriptedPlatformPythonInterface::ScriptedPlatformPythonInterface(
ScriptInterpreterPythonImpl &interpreter)
: ScriptedPlatformInterface(), ScriptedPythonInterface(interpreter) {}
@@ -93,4 +97,14 @@ Status ScriptedPlatformPythonInterface::KillProcess(lldb::pid_t pid) {
return GetStatusFromMethod("kill_process", pid);
}
+void ScriptedPlatformPythonInterface::Initialize() {
+ PluginManager::RegisterPlugin(
+ GetPluginNameStatic(), "Mock platform and interact with its processes.",
+ CreateInstance, eScriptLanguagePython, {});
+}
+
+void ScriptedPlatformPythonInterface::Terminate() {
+ PluginManager::UnregisterPlugin(CreateInstance);
+}
+
#endif // LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
index 0842d3a..01ee40a 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h
@@ -10,15 +10,16 @@
#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDPLATFORMPYTHONINTERFACE_H
#include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
#if LLDB_ENABLE_PYTHON
-#include "ScriptedPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/ScriptedPlatformInterface.h"
+#include "../ScriptedPythonInterface.h"
namespace lldb_private {
class ScriptedPlatformPythonInterface : public ScriptedPlatformInterface,
- public ScriptedPythonInterface {
+ public ScriptedPythonInterface,
+ public PluginInterface {
public:
ScriptedPlatformPythonInterface(ScriptInterpreterPythonImpl &interpreter);
@@ -43,6 +44,16 @@ public:
Status LaunchProcess(lldb::ProcessLaunchInfoSP launch_info) override;
Status KillProcess(lldb::pid_t pid) override;
+
+ static void Initialize();
+
+ static void Terminate();
+
+ static llvm::StringRef GetPluginNameStatic() {
+ return "ScriptedPlatformPythonInterface";
+ }
+
+ llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
};
} // namespace lldb_private
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
new file mode 100644
index 0000000..db41da1
--- /dev/null
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_lldb_library(lldbPluginScriptInterpreterPythonScriptedThreadPlanPythonInterface PLUGIN
+
+ ScriptedThreadPlanPythonInterface.cpp
+
+ LINK_LIBS
+ lldbCore
+ lldbHost
+ lldbInterpreter
+ lldbTarget
+ lldbPluginScriptInterpreterPython
+ ${Python3_LIBRARIES}
+ ${LLDB_LIBEDIT_LIBS}
+
+ LINK_COMPONENTS
+ Support
+ )
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
index f23858c..5f1c7da 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.cpp
@@ -6,23 +6,28 @@
//
//===----------------------------------------------------------------------===//
+#include "lldb/Core/PluginManager.h"
#include "lldb/Host/Config.h"
#include "lldb/Utility/Log.h"
#include "lldb/lldb-enumerations.h"
#if LLDB_ENABLE_PYTHON
+// clang-format off
// LLDB Python header must be included first
-#include "../lldb-python.h"
+#include "../../lldb-python.h"
+// clang-format on
-#include "../SWIGPythonBridge.h"
-#include "../ScriptInterpreterPythonImpl.h"
+#include "../../SWIGPythonBridge.h"
+#include "../../ScriptInterpreterPythonImpl.h"
#include "ScriptedThreadPlanPythonInterface.h"
using namespace lldb;
using namespace lldb_private;
using namespace lldb_private::python;
+LLDB_PLUGIN_DEFINE_ADV(ScriptedThreadPlanPythonInterface, ScriptInterpreterPythonScriptedThreadPlanPythonInterface)
+
ScriptedThreadPlanPythonInterface::ScriptedThreadPlanPythonInterface(
ScriptInterpreterPythonImpl &interpreter)
: ScriptedThreadPlanInterface(), ScriptedPythonInterface(interpreter) {}
@@ -102,4 +107,19 @@ ScriptedThreadPlanPythonInterface::GetStopDescription(lldb::StreamSP &stream) {
return llvm::Error::success();
}
+void ScriptedThreadPlanPythonInterface::Initialize() {
+ const std::vector<llvm::StringRef> ci_usages = {
+ "thread step-scripted -C <script-name> [-k key -v value ...]"};
+ const std::vector<llvm::StringRef> api_usages = {
+ "SBThread.StepUsingScriptedThreadPlan"};
+ PluginManager::RegisterPlugin(
+ GetPluginNameStatic(),
+ llvm::StringRef("Alter thread stepping logic and stop reason"),
+ CreateInstance, eScriptLanguagePython, {ci_usages, api_usages});
+}
+
+void ScriptedThreadPlanPythonInterface::Terminate() {
+ PluginManager::UnregisterPlugin(CreateInstance);
+}
+
#endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
index 6ec89b9..c0a82f4 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h
@@ -10,16 +10,18 @@
#define LLDB_PLUGINS_SCRIPTINTERPRETER_PYTHON_INTERFACES_SCRIPTEDTHREADPLANPYTHONINTERFACE_H
#include "lldb/Host/Config.h"
+#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
#if LLDB_ENABLE_PYTHON
-#include "ScriptedPythonInterface.h"
-#include "lldb/Interpreter/Interfaces/ScriptedThreadPlanInterface.h"
+#include "../ScriptedPythonInterface.h"
+
#include <optional>
namespace lldb_private {
class ScriptedThreadPlanPythonInterface : public ScriptedThreadPlanInterface,
- public ScriptedPythonInterface {
+ public ScriptedPythonInterface,
+ public PluginInterface {
public:
ScriptedThreadPlanPythonInterface(ScriptInterpreterPythonImpl &interpreter);
@@ -41,6 +43,16 @@ public:
lldb::StateType GetRunState() override;
llvm::Error GetStopDescription(lldb::StreamSP &stream) override;
+
+ static void Initialize();
+
+ static void Terminate();
+
+ static llvm::StringRef GetPluginNameStatic() {
+ return "ScriptedThreadPlanPythonInterface";
+ }
+
+ llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
};
} // namespace lldb_private
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index 70fa6d8..d34fdf1 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -14,11 +14,10 @@
// LLDB Python header must be included first
#include "lldb-python.h"
-#include "Interfaces/OperatingSystemPythonInterface.h"
-#include "Interfaces/ScriptedPlatformPythonInterface.h"
+#include "Interfaces/OperatingSystemPythonInterface/OperatingSystemPythonInterface.h"
+#include "Interfaces/ScriptedPlatformPythonInterface/ScriptedPlatformPythonInterface.h"
#include "Interfaces/ScriptedProcessPythonInterface.h"
-#include "Interfaces/ScriptedThreadPlanPythonInterface.h"
-#include "Interfaces/ScriptedThreadPythonInterface.h"
+#include "Interfaces/ScriptedThreadPlanPythonInterface/ScriptedThreadPlanPythonInterface.h"
#include "PythonDataObjects.h"
#include "PythonReadline.h"
#include "SWIGPythonBridge.h"
diff --git a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp
index c46dc54..0724547 100644
--- a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp
+++ b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp
@@ -783,7 +783,7 @@ protected:
// Now check if we have a running process. If so, we should instruct the
// process monitor to enable/disable DarwinLog support now.
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
// Grab the active process.
auto process_sp = target.GetProcessSP();
@@ -865,7 +865,7 @@ protected:
// Figure out if we've got a process. If so, we can tell if DarwinLog is
// available for that process.
- Target &target = GetSelectedOrDummyTarget();
+ Target &target = GetTarget();
auto process_sp = target.GetProcessSP();
if (!process_sp) {
stream.PutCString("Availability: unknown (requires process)\n");
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp
index 409e9bb..4ed523b 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.cpp
@@ -37,7 +37,7 @@ DWARFASTParser::ParseChildArrayInfo(const DWARFDIE &parent_die,
if (attributes.Size() == 0)
continue;
- uint64_t num_elements = 0;
+ std::optional<uint64_t> num_elements;
uint64_t lower_bound = 0;
uint64_t upper_bound = 0;
bool upper_bound_valid = false;
@@ -91,7 +91,7 @@ DWARFASTParser::ParseChildArrayInfo(const DWARFDIE &parent_die,
}
}
- if (num_elements == 0) {
+ if (!num_elements || *num_elements == 0) {
if (upper_bound_valid && upper_bound >= lower_bound)
num_elements = upper_bound - lower_bound + 1;
}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 85c59a6..a4dcde1 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -1395,20 +1395,20 @@ DWARFASTParserClang::ParseArrayType(const DWARFDIE &die,
uint64_t array_element_bit_stride = byte_stride * 8 + bit_stride;
CompilerType clang_type;
if (array_info && array_info->element_orders.size() > 0) {
- uint64_t num_elements = 0;
auto end = array_info->element_orders.rend();
for (auto pos = array_info->element_orders.rbegin(); pos != end; ++pos) {
- num_elements = *pos;
- clang_type = m_ast.CreateArrayType(array_element_type, num_elements,
- attrs.is_vector);
+ clang_type = m_ast.CreateArrayType(
+ array_element_type, /*element_count=*/*pos, attrs.is_vector);
+
+ uint64_t num_elements = pos->value_or(0);
array_element_type = clang_type;
array_element_bit_stride = num_elements
? array_element_bit_stride * num_elements
: array_element_bit_stride;
}
} else {
- clang_type =
- m_ast.CreateArrayType(array_element_type, 0, attrs.is_vector);
+ clang_type = m_ast.CreateArrayType(
+ array_element_type, /*element_count=*/std::nullopt, attrs.is_vector);
}
ConstString empty_name;
TypeSP type_sp =
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index f70efe5..484ca04 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -2233,30 +2233,31 @@ TypeSystemClang::CreateBlockPointerType(const CompilerType &function_type) {
#pragma mark Array Types
-CompilerType TypeSystemClang::CreateArrayType(const CompilerType &element_type,
- size_t element_count,
- bool is_vector) {
- if (element_type.IsValid()) {
- ASTContext &ast = getASTContext();
+CompilerType
+TypeSystemClang::CreateArrayType(const CompilerType &element_type,
+ std::optional<size_t> element_count,
+ bool is_vector) {
+ if (!element_type.IsValid())
+ return {};
- if (is_vector) {
- return GetType(ast.getExtVectorType(ClangUtil::GetQualType(element_type),
- element_count));
- } else {
+ ASTContext &ast = getASTContext();
- llvm::APInt ap_element_count(64, element_count);
- if (element_count == 0) {
- return GetType(
- ast.getIncompleteArrayType(ClangUtil::GetQualType(element_type),
- clang::ArraySizeModifier::Normal, 0));
- } else {
- return GetType(ast.getConstantArrayType(
- ClangUtil::GetQualType(element_type), ap_element_count, nullptr,
- clang::ArraySizeModifier::Normal, 0));
- }
- }
- }
- return CompilerType();
+ // Unknown number of elements; this is an incomplete array
+ // (e.g., variable length array with non-constant bounds, or
+ // a flexible array member).
+ if (!element_count)
+ return GetType(
+ ast.getIncompleteArrayType(ClangUtil::GetQualType(element_type),
+ clang::ArraySizeModifier::Normal, 0));
+
+ if (is_vector)
+ return GetType(ast.getExtVectorType(ClangUtil::GetQualType(element_type),
+ *element_count));
+
+ llvm::APInt ap_element_count(64, *element_count);
+ return GetType(ast.getConstantArrayType(ClangUtil::GetQualType(element_type),
+ ap_element_count, nullptr,
+ clang::ArraySizeModifier::Normal, 0));
}
CompilerType TypeSystemClang::CreateStructForIdentifier(
@@ -4726,66 +4727,68 @@ TypeSystemClang::GetFloatTypeSemantics(size_t byte_size) {
}
std::optional<uint64_t>
+TypeSystemClang::GetObjCBitSize(QualType qual_type,
+ ExecutionContextScope *exe_scope) {
+ assert(qual_type->isObjCObjectOrInterfaceType());
+ ExecutionContext exe_ctx(exe_scope);
+ if (Process *process = exe_ctx.GetProcessPtr()) {
+ if (ObjCLanguageRuntime *objc_runtime =
+ ObjCLanguageRuntime::Get(*process)) {
+ if (std::optional<uint64_t> bit_size =
+ objc_runtime->GetTypeBitSize(GetType(qual_type)))
+ return *bit_size;
+ }
+ } else {
+ static bool g_printed = false;
+ if (!g_printed) {
+ StreamString s;
+ DumpTypeDescription(qual_type.getAsOpaquePtr(), s);
+
+ llvm::outs() << "warning: trying to determine the size of type ";
+ llvm::outs() << s.GetString() << "\n";
+ llvm::outs() << "without a valid ExecutionContext. this is not "
+ "reliable. please file a bug against LLDB.\n";
+ llvm::outs() << "backtrace:\n";
+ llvm::sys::PrintStackTrace(llvm::outs());
+ llvm::outs() << "\n";
+ g_printed = true;
+ }
+ }
+
+ return getASTContext().getTypeSize(qual_type) +
+ getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
+}
+
+std::optional<uint64_t>
TypeSystemClang::GetBitSize(lldb::opaque_compiler_type_t type,
ExecutionContextScope *exe_scope) {
- if (GetCompleteType(type)) {
- clang::QualType qual_type(GetCanonicalQualType(type));
- const clang::Type::TypeClass type_class = qual_type->getTypeClass();
- switch (type_class) {
- case clang::Type::Record:
- if (GetCompleteType(type))
- return getASTContext().getTypeSize(qual_type);
- else
- return std::nullopt;
- break;
+ if (!GetCompleteType(type))
+ return std::nullopt;
- case clang::Type::ObjCInterface:
- case clang::Type::ObjCObject: {
- ExecutionContext exe_ctx(exe_scope);
- Process *process = exe_ctx.GetProcessPtr();
- if (process) {
- if (ObjCLanguageRuntime *objc_runtime =
- ObjCLanguageRuntime::Get(*process)) {
- if (std::optional<uint64_t> bit_size =
- objc_runtime->GetTypeBitSize(GetType(qual_type)))
- return *bit_size;
- }
- } else {
- static bool g_printed = false;
- if (!g_printed) {
- StreamString s;
- DumpTypeDescription(type, s);
-
- llvm::outs() << "warning: trying to determine the size of type ";
- llvm::outs() << s.GetString() << "\n";
- llvm::outs() << "without a valid ExecutionContext. this is not "
- "reliable. please file a bug against LLDB.\n";
- llvm::outs() << "backtrace:\n";
- llvm::sys::PrintStackTrace(llvm::outs());
- llvm::outs() << "\n";
- g_printed = true;
- }
- }
- }
- [[fallthrough]];
- default:
- const uint32_t bit_size = getASTContext().getTypeSize(qual_type);
- if (bit_size == 0) {
- if (qual_type->isIncompleteArrayType())
- return getASTContext().getTypeSize(
- qual_type->getArrayElementTypeNoTypeQual()
- ->getCanonicalTypeUnqualified());
- }
- if (qual_type->isObjCObjectOrInterfaceType())
- return bit_size +
- getASTContext().getTypeSize(getASTContext().ObjCBuiltinClassTy);
- // Function types actually have a size of 0, that's not an error.
- if (qual_type->isFunctionProtoType())
- return bit_size;
- if (bit_size)
- return bit_size;
- }
+ clang::QualType qual_type(GetCanonicalQualType(type));
+ const clang::Type::TypeClass type_class = qual_type->getTypeClass();
+ switch (type_class) {
+ case clang::Type::ConstantArray:
+ case clang::Type::FunctionProto:
+ case clang::Type::Record:
+ return getASTContext().getTypeSize(qual_type);
+ case clang::Type::ObjCInterface:
+ case clang::Type::ObjCObject:
+ return GetObjCBitSize(qual_type, exe_scope);
+ case clang::Type::IncompleteArray: {
+ const uint64_t bit_size = getASTContext().getTypeSize(qual_type);
+ if (bit_size == 0)
+ return getASTContext().getTypeSize(
+ qual_type->getArrayElementTypeNoTypeQual()
+ ->getCanonicalTypeUnqualified());
+
+ return bit_size;
}
+ default:
+ if (const uint64_t bit_size = getASTContext().getTypeSize(qual_type))
+ return bit_size;
+ }
+
return std::nullopt;
}
@@ -5456,9 +5459,9 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
case clang::Type::IncompleteArray:
if (auto array_info =
GetDynamicArrayInfo(*this, GetSymbolFile(), qual_type, exe_ctx))
- // Only 1-dimensional arrays are supported.
+ // FIXME: Only 1-dimensional arrays are supported.
num_children = array_info->element_orders.size()
- ? array_info->element_orders.back()
+ ? array_info->element_orders.back().value_or(0)
: 0;
break;
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index d67b7a4..56a5c0a 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -498,7 +498,8 @@ public:
// Array Types
CompilerType CreateArrayType(const CompilerType &element_type,
- size_t element_count, bool is_vector);
+ std::optional<size_t> element_count,
+ bool is_vector);
// Enumeration Types
CompilerType CreateEnumerationType(llvm::StringRef name,
@@ -1172,6 +1173,9 @@ private:
/// on creation of a new instance.
void LogCreation() const;
+ std::optional<uint64_t> GetObjCBitSize(clang::QualType qual_type,
+ ExecutionContextScope *exe_scope);
+
// Classes that inherit from TypeSystemClang can see and modify these
std::string m_target_triple;
std::unique_ptr<clang::ASTContext> m_ast_up;
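Taken together, the DWARFASTParserClang and TypeSystemClang hunks change what an absent element count means: `std::nullopt` now yields an incomplete array type, while an explicit count of 0 yields a constant zero-length array. A standalone sketch of that dispatch order, with simplified stand-ins rather than the real Clang AST calls:

    #include <cstddef>
    #include <iostream>
    #include <optional>

    enum class ArrayKind { Incomplete, Vector, Constant };

    // Same ordering as the rewritten CreateArrayType: missing count first,
    // then the vector flag, then a constant-sized array (including size 0).
    static ArrayKind classifyArrayRequest(std::optional<std::size_t> element_count,
                                          bool is_vector) {
      if (!element_count)
        return ArrayKind::Incomplete; // VLA with unknown bounds, flexible member
      if (is_vector)
        return ArrayKind::Vector;     // ext-vector of *element_count elements
      return ArrayKind::Constant;     // covers element_count == 0, i.e. int[0]
    }

    int main() {
      std::cout << int(classifyArrayRequest(std::nullopt, false)) << "\n"; // 0
      std::cout << int(classifyArrayRequest(0, false)) << "\n";            // 2
      std::cout << int(classifyArrayRequest(4, true)) << "\n";             // 1
    }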
diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp
index e258a4e..e2dbd81 100644
--- a/lldb/source/Symbol/UnwindPlan.cpp
+++ b/lldb/source/Symbol/UnwindPlan.cpp
@@ -46,6 +46,8 @@ operator==(const UnwindPlan::Row::RegisterLocation &rhs) const {
return !memcmp(m_location.expr.opcodes, rhs.m_location.expr.opcodes,
m_location.expr.length);
break;
+ case isConstant:
+ return m_location.constant_value == rhs.m_location.constant_value;
}
}
return false;
@@ -153,6 +155,9 @@ void UnwindPlan::Row::RegisterLocation::Dump(Stream &s,
if (m_type == atDWARFExpression)
s.PutChar(']');
} break;
+ case isConstant:
+ s.Printf("=0x%" PRIx64, m_location.constant_value);
+ break;
}
}
@@ -351,6 +356,18 @@ bool UnwindPlan::Row::SetRegisterLocationToSame(uint32_t reg_num,
return true;
}
+bool UnwindPlan::Row::SetRegisterLocationToIsConstant(uint32_t reg_num,
+ uint64_t constant,
+ bool can_replace) {
+ if (!can_replace &&
+ m_register_locations.find(reg_num) != m_register_locations.end())
+ return false;
+ RegisterLocation reg_loc;
+ reg_loc.SetIsConstant(constant);
+ m_register_locations[reg_num] = reg_loc;
+ return true;
+}
+
bool UnwindPlan::Row::operator==(const UnwindPlan::Row &rhs) const {
return m_offset == rhs.m_offset && m_cfa_value == rhs.m_cfa_value &&
m_afa_value == rhs.m_afa_value &&
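A standalone sketch (plain std::map, not the LLDB Row class) of the insert-or-keep behaviour the new SetRegisterLocationToIsConstant follows:

    #include <cstdint>
    #include <iostream>
    #include <map>

    // Refuse to overwrite an existing register entry unless explicitly allowed.
    static bool setRegisterToConstant(std::map<uint32_t, uint64_t> &row,
                                      uint32_t reg_num, uint64_t constant,
                                      bool can_replace) {
      if (!can_replace && row.count(reg_num))
        return false;
      row[reg_num] = constant;
      return true;
    }

    int main() {
      std::map<uint32_t, uint64_t> row;
      std::cout << setRegisterToConstant(row, 29, 0x1000, false) << "\n"; // 1, inserted
      std::cout << setRegisterToConstant(row, 29, 0x2000, false) << "\n"; // 0, kept
      std::cout << std::hex << row[29] << "\n";                           // 1000
    }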
diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index bc8081f..a61228d 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -1694,6 +1694,15 @@ RegisterContextUnwind::SavedLocationForRegister(
return UnwindLLDB::RegisterSearchResult::eRegisterNotFound;
}
+ if (unwindplan_regloc.IsConstant()) {
+ regloc.type = UnwindLLDB::RegisterLocation::eRegisterValueInferred;
+ regloc.location.inferred_value = unwindplan_regloc.GetConstant();
+ m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = regloc;
+ UnwindLogMsg("supplying caller's register %s (%d) via constant value",
+ regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
+ return UnwindLLDB::RegisterSearchResult::eRegisterFound;
+ }
+
UnwindLogMsg("no save location for %s (%d) in this stack frame",
regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB));
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index ec0da8a..129683c 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -2155,12 +2155,21 @@ bool Target::ReadPointerFromMemory(const Address &addr, Status &error,
return false;
}
-ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify,
- Status *error_ptr) {
+ModuleSP Target::GetOrCreateModule(const ModuleSpec &orig_module_spec,
+ bool notify, Status *error_ptr) {
ModuleSP module_sp;
Status error;
+ // Apply any remappings specified in target.object-map:
+ ModuleSpec module_spec(orig_module_spec);
+ PathMappingList &obj_mapping = GetObjectPathMap();
+ if (std::optional<FileSpec> remapped_obj_file =
+ obj_mapping.RemapPath(orig_module_spec.GetFileSpec().GetPath(),
+ true /* only_if_exists */)) {
+ module_spec.GetFileSpec().SetPath(remapped_obj_file->GetPath());
+ }
+
// First see if we already have this module in our module list. If we do,
// then we're done, we don't need to consult the shared modules list. But
// only do this if we are passed a UUID.
@@ -4459,6 +4468,14 @@ PathMappingList &TargetProperties::GetSourcePathMap() const {
return option_value->GetCurrentValue();
}
+PathMappingList &TargetProperties::GetObjectPathMap() const {
+ const uint32_t idx = ePropertyObjectMap;
+ OptionValuePathMappings *option_value =
+ m_collection_sp->GetPropertyAtIndexAsOptionValuePathMappings(idx);
+ assert(option_value);
+ return option_value->GetCurrentValue();
+}
+
bool TargetProperties::GetAutoSourceMapRelative() const {
const uint32_t idx = ePropertyAutoSourceMapRelative;
return GetPropertyAtIndexAs<bool>(
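The remapping added to GetOrCreateModule follows the same prefix-substitution rules the property description spells out: pairs are checked in order and the first matching prefix wins. A standalone sketch of that lookup, reusing the path from the new TestLinuxCore case; it deliberately omits the `only_if_exists` file-system check the real PathMappingList performs:

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // First pair whose prefix matches the path is substituted; otherwise the
    // original path is kept.
    static std::string remapObjectPath(
        const std::string &path,
        const std::vector<std::pair<std::string, std::string>> &object_map) {
      for (const auto &[prefix, replacement] : object_map)
        if (path.rfind(prefix, 0) == 0) // path starts with prefix
          return replacement + path.substr(prefix.size());
      return path;
    }

    int main() {
      std::vector<std::pair<std::string, std::string>> map = {
          {"/home/labath/test", "/tmp/lldb_i386_object_map"}};
      std::cout << remapObjectPath("/home/labath/test/a.out", map) << "\n";
      // -> /tmp/lldb_i386_object_map/a.out
    }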
diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td
index 7f79218..4404a45 100644
--- a/lldb/source/Target/TargetProperties.td
+++ b/lldb/source/Target/TargetProperties.td
@@ -46,6 +46,9 @@ let Definition = "target" in {
def SourceMap: Property<"source-map", "PathMap">,
DefaultStringValue<"">,
Desc<"Source path remappings apply substitutions to the paths of source files, typically needed to debug from a different host than the one that built the target. The source-map property consists of an array of pairs, the first element is a path prefix, and the second is its replacement. The syntax is `prefix1 replacement1 prefix2 replacement2...`. The pairs are checked in order, the first prefix that matches is used, and that prefix is substituted with the replacement. A common pattern is to use source-map in conjunction with the clang -fdebug-prefix-map flag. In the build, use `-fdebug-prefix-map=/path/to/build_dir=.` to rewrite the host specific build directory to `.`. Then for debugging, use `settings set target.source-map . /path/to/local_dir` to convert `.` to a valid local path.">;
+ def ObjectMap: Property<"object-map", "PathMap">,
+ DefaultStringValue<"">,
+ Desc<"Object path remappings apply substitutions to the paths of object files, typically needed to debug from a different host than the one that built the target. The object-map property consists of an array of pairs, the first element is a path prefix, and the second is its replacement. The syntax is `prefix1 replacement1 prefix2 replacement2...`. The pairs are checked in order, the first prefix that matches is used, and that prefix is substituted with the replacement.">;
def AutoSourceMapRelative: Property<"auto-source-map-relative", "Boolean">,
DefaultTrue,
Desc<"Automatically deduce source path mappings based on source file breakpoint resolution. It only deduces source mapping if source file breakpoint request is using full path and if the debug info contains relative paths.">;
diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
index fc9ddac..bee6e66 100644
--- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
+++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
@@ -12,15 +12,17 @@ from lldbsuite.test import lldbutil
class TestMultipleSimultaneousDebuggers(TestBase):
NO_DEBUG_INFO_TESTCASE = True
- # This test has been flaky lately on Linux buildbots and Github/Buildkite CI
- # runs.
- @skipIfLinux
+ # Sometimes times out on Linux, see https://github.com/llvm/llvm-project/issues/101162.
@skipIfNoSBHeaders
@skipIfWindows
@skipIfHostIncompatibleWithTarget
def test_multiple_debuggers(self):
self.driver_exe = self.getBuildArtifact("multi-process-driver")
- self.buildDriver("multi-process-driver.cpp", self.driver_exe)
+ self.buildDriver(
+ "multi-process-driver.cpp",
+ self.driver_exe,
+ defines=[("LLDB_HOST_ARCH", lldbplatformutil.getArchitecture())],
+ )
self.addTearDownHook(lambda: os.remove(self.driver_exe))
self.inferior_exe = self.getBuildArtifact("testprog")
diff --git a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
index c9c0bcf..64728fb 100644
--- a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
+++ b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
@@ -16,6 +16,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <inttypes.h>
#include "lldb/API/LLDB.h"
#include "lldb/API/SBCommandInterpreter.h"
@@ -30,6 +31,9 @@
#define DEBUG 0
+#define STR1(x) #x
+#define STR(x) STR1(x)
+
using namespace lldb;
bool *completed_threads_array = 0;
@@ -102,20 +106,21 @@ void *do_one_debugger (void *in)
if (debugger.IsValid ())
{
debugger.SetAsync (true);
- SBTarget target = debugger.CreateTargetWithFileAndArch(inferior_process_name, "x86_64");
+ SBTarget target = debugger.CreateTargetWithFileAndArch(inferior_process_name,
+ STR(LLDB_HOST_ARCH));
SBCommandInterpreter command_interp = debugger.GetCommandInterpreter();
if (target.IsValid())
{
SBBreakpoint bar_br = target.BreakpointCreateByName ("bar", "testprog");
if (!bar_br.IsValid())
{
- printf ("#%lld: failed to set breakpoint on bar, exiting.\n", threadnum);
+ printf ("#%" PRIu64 ": failed to set breakpoint on bar, exiting.\n", threadnum);
exit (1);
}
SBBreakpoint foo_br = target.BreakpointCreateByName ("foo", "testprog");
if (!foo_br.IsValid())
{
- printf ("#%lld: Failed to set breakpoint on foo()\n", threadnum);
+ printf ("#%" PRIu64 ": Failed to set breakpoint on foo()\n", threadnum);
}
SBLaunchInfo launch_info (NULL);
@@ -136,15 +141,17 @@ void *do_one_debugger (void *in)
if (!walk_stack_to_main (process.GetThreadAtIndex(0)))
{
- printf ("#%lld: backtrace while @ foo() failed\n", threadnum);
+ printf ("#%" PRIu64 ": backtrace while @ foo() failed\n", threadnum);
completed_threads_array[threadnum] = true;
return (void *) 1;
}
- if (strcmp (process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName(), "foo") != 0)
+ // On Linux the () are included.
+ const char* hit_fn = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName();
+ if (strcmp (hit_fn, "foo") != 0 && strcmp (hit_fn, "foo()") != 0)
{
#if DEBUG == 1
- printf ("#%lld: First breakpoint did not stop at foo(), instead stopped at '%s'\n", threadnum, process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName());
+ printf ("#%" PRIu64 ": First breakpoint did not stop at foo(), instead stopped at '%s'\n", threadnum, process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName());
#endif
completed_threads_array[threadnum] = true;
return (void*) 1;
@@ -156,7 +163,7 @@ void *do_one_debugger (void *in)
if (process.GetState() == StateType::eStateExited)
{
- printf ("#%lld: Process exited\n", threadnum);
+ printf ("#%" PRIu64 ": Process exited\n", threadnum);
completed_threads_array[threadnum] = true;
return (void *) 1;
}
@@ -164,14 +171,15 @@ void *do_one_debugger (void *in)
if (!walk_stack_to_main (process.GetThreadAtIndex(0)))
{
- printf ("#%lld: backtrace while @ bar() failed\n", threadnum);
+ printf ("#%" PRIu64 ": backtrace while @ bar() failed\n", threadnum);
completed_threads_array[threadnum] = true;
return (void *) 1;
}
- if (strcmp (process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName(), "bar") != 0)
+ hit_fn = process.GetThreadAtIndex(0).GetFrameAtIndex(0).GetFunctionName();
+ if (strcmp (hit_fn, "bar") != 0 && strcmp (hit_fn, "bar()") != 0)
{
- printf ("#%lld: First breakpoint did not stop at bar()\n", threadnum);
+ printf ("#%" PRIu64 ": First breakpoint did not stop at bar()\n", threadnum);
completed_threads_array[threadnum] = true;
return (void*) 1;
}
@@ -183,7 +191,7 @@ void *do_one_debugger (void *in)
SBDebugger::Destroy(debugger);
#if DEBUG == 1
- printf ("#%lld: All good!\n", threadnum);
+ printf ("#%" PRIu64 ": All good!\n", threadnum);
#endif
successful_threads_array[threadnum] = true;
completed_threads_array[threadnum] = true;
@@ -191,7 +199,7 @@ void *do_one_debugger (void *in)
}
else
{
- printf("#%lld: process failed to launch\n", threadnum);
+ printf("#%" PRIu64 ": process failed to launch\n", threadnum);
successful_threads_array[threadnum] = false;
completed_threads_array[threadnum] = true;
return (void*) 0;
@@ -199,7 +207,7 @@ void *do_one_debugger (void *in)
}
else
{
- printf ("#%lld: did not get valid target\n", threadnum);
+ printf ("#%" PRIu64 ": did not get valid target\n", threadnum);
successful_threads_array[threadnum] = false;
completed_threads_array[threadnum] = true;
return (void*) 0;
@@ -207,7 +215,7 @@ void *do_one_debugger (void *in)
}
else
{
- printf ("#%lld: did not get debugger\n", threadnum);
+ printf ("#%" PRIu64 ": did not get debugger\n", threadnum);
successful_threads_array[threadnum] = false;
completed_threads_array[threadnum] = true;
return (void*) 0;
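Two small idioms carry this test change: two-level macro stringification so the -D value of LLDB_HOST_ARCH becomes a string literal, and PRIu64 so the thread number prints portably. A standalone sketch; the fallback #define is only there so the sketch compiles on its own:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    #define STR1(x) #x
    #define STR(x) STR1(x) // expand x first, then stringify it

    #ifndef LLDB_HOST_ARCH
    #define LLDB_HOST_ARCH x86_64 // hypothetical -D value, for illustration only
    #endif

    int main() {
      uint64_t threadnum = 7;
      std::printf("#%" PRIu64 ": arch is %s\n", threadnum, STR(LLDB_HOST_ARCH));
    }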
diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
index 0afac26..0b9d17b 100644
--- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
+++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py
@@ -249,6 +249,32 @@ class LinuxCoreTestCase(TestBase):
self.dbg.DeleteTarget(target)
+ def test_object_map(self):
+ """Test that lldb can find the exe for an i386 linux core file using the object map."""
+
+ # Copy linux-i386.out to lldb_i386_object_map/a.out
+ tmp_object_map_root = os.path.join(self.getBuildDir(), "lldb_i386_object_map")
+ executable = os.path.join(tmp_object_map_root, "a.out")
+ lldbutil.mkdir_p(os.path.dirname(executable))
+ shutil.copyfile("linux-i386.out", executable)
+
+ # Replace the original module path at /home/labath/test and load the core
+ self.runCmd(
+ "settings set target.object-map /home/labath/test {}".format(
+ tmp_object_map_root
+ )
+ )
+
+ target = self.dbg.CreateTarget(None)
+ process = target.LoadCore("linux-i386.core")
+
+ # Check that we did load the mapped executable
+ exe_module_spec = process.GetTarget().GetModuleAtIndex(0).GetFileSpec()
+ self.assertTrue(exe_module_spec.fullpath.startswith(tmp_object_map_root))
+
+ self.check_all(process, self._i386_pid, self._i386_regions, "a.out")
+ self.dbg.DeleteTarget(target)
+
@skipIfLLVMTargetMissing("X86")
@skipIfWindows
def test_x86_64_sysroot(self):
diff --git a/lldb/test/API/lang/c/struct_types/main.c b/lldb/test/API/lang/c/struct_types/main.c
index e683c49..70217c5 100644
--- a/lldb/test/API/lang/c/struct_types/main.c
+++ b/lldb/test/API/lang/c/struct_types/main.c
@@ -1,3 +1,4 @@
+// clang-format off
struct things_to_sum {
int a;
int b;
@@ -18,7 +19,7 @@ int main (int argc, char const *argv[])
}; //% self.expect("frame variable pt.padding[0]", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["pt.padding[0] = "])
//% self.expect("frame variable pt.padding[1]", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["pt.padding[1] = "])
//% self.expect_expr("pt.padding[0]", result_type="char")
- //% self.expect("image lookup -t point_tag", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ['padding[]'])
+ //% self.expect("image lookup -t point_tag", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ['padding[0]'])
struct {} empty;
//% self.expect("frame variable empty", substrs = ["empty = {}"])
diff --git a/lldb/test/Shell/SymbolFile/DWARF/vla.cpp b/lldb/test/Shell/SymbolFile/DWARF/vla.cpp
new file mode 100644
index 0000000..344b100
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/vla.cpp
@@ -0,0 +1,80 @@
+// RUN: %clangxx_host -gdwarf -std=c++11 -o %t %s
+// RUN: %lldb %t \
+// RUN: -o run \
+// RUN: -o "frame var --show-types f" \
+// RUN: -o "frame var vla0" \
+// RUN: -o "frame var fla0" \
+// RUN: -o "frame var fla1" \
+// RUN: -o "frame var vla01" \
+// RUN: -o "frame var vla10" \
+// RUN: -o "frame var vlaN" \
+// RUN: -o "frame var vlaNM" \
+// RUN: -o exit | FileCheck %s
+
+struct Foo {
+ static constexpr int n = 1;
+ int m_vlaN[n];
+
+ int m_vla0[0];
+};
+
+int main() {
+ Foo f;
+ f.m_vlaN[0] = 60;
+
+ // CHECK: (lldb) frame var --show-types f
+ // CHECK-NEXT: (Foo) f = {
+ // CHECK-NEXT: (int[1]) m_vlaN = {
+ // CHECK-NEXT: (int) [0] = 60
+ // CHECK-NEXT: }
+ // CHECK-NEXT: (int[0]) m_vla0 = {}
+ // CHECK-NEXT: }
+
+ int vla0[0] = {};
+
+ // CHECK: (lldb) frame var vla0
+ // CHECK-NEXT: (int[0]) vla0 = {}
+
+ int fla0[] = {};
+
+ // CHECK: (lldb) frame var fla0
+ // CHECK-NEXT: (int[0]) fla0 = {}
+
+ int fla1[] = {42};
+
+ // CHECK: (lldb) frame var fla1
+ // CHECK-NEXT: (int[1]) fla1 = ([0] = 42)
+
+ int vla01[0][1];
+
+ // CHECK: (lldb) frame var vla01
+ // CHECK-NEXT: (int[0][1]) vla01 = {}
+
+ int vla10[1][0];
+
+ // CHECK: (lldb) frame var vla10
+ // CHECK-NEXT: (int[1][0]) vla10 = ([0] = int[0]
+
+ int n = 3;
+ int vlaN[n];
+ for (int i = 0; i < n; ++i)
+ vlaN[i] = -i;
+
+ // CHECK: (lldb) frame var vlaN
+ // CHECK-NEXT: (int[]) vlaN = ([0] = 0, [1] = -1, [2] = -2)
+
+ int m = 2;
+ int vlaNM[n][m];
+ for (int i = 0; i < n; ++i)
+ for (int j = 0; j < m; ++j)
+ vlaNM[i][j] = i + j;
+
+ // FIXME: multi-dimensional VLAs aren't well supported
+ // CHECK: (lldb) frame var vlaNM
+ // CHECK-NEXT: (int[][]) vlaNM = {
+ // CHECK-NEXT: [0] = ([0] = 0, [1] = 1, [2] = 1)
+ // CHECK-NEXT: [1] = ([0] = 1, [1] = 1, [2] = 2)
+ // CHECK-NEXT: }
+
+ __builtin_debugtrap();
+}
diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md
index 8ecbaf7..11a14d2 100644
--- a/lldb/tools/lldb-dap/README.md
+++ b/lldb/tools/lldb-dap/README.md
@@ -157,6 +157,20 @@ locally on port `2345`.
}
```
+You can also use the `gdb-remote-port` parameter to send an attach request
+to a debug server running on the current machine,
+instead of using the custom command `attachCommands`.
+
+```javascript
+{
+ "name": "Local Debug Server",
+ "type": "lldb-dap",
+ "request": "attach",
+ "program": "/tmp/a.out",
+ "gdb-remote-port": 2345,
+}
+```
+
#### Connect to a Debug Server on Another Machine
This connects to a debug server running on another machine with hostname
@@ -173,6 +187,23 @@ port `5678` of that other machine.
}
```
+You can also use the `gdb-remote-hostname` and `gdb-remote-port` parameters
+to send an attach request to a debug server running on a different machine,
+instead of the custom command `attachCommands`.
+The default hostname used is `localhost`.
+
+
+```javascript
+{
+  "name": "Remote Debug Server",
+ "type": "lldb-dap",
+ "request": "attach",
+ "program": "/tmp/a.out",
+ "gdb-remote-port": 5678,
+ "gdb-remote-hostname": "hostname",
+}
+```
+
## Custom debugger commands
The `lldb-dap` tool includes additional custom commands to support the Debug
diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index fd5de30..97e4efe 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -1,7 +1,7 @@
{
"name": "lldb-dap",
"displayName": "LLDB DAP",
- "version": "0.2.2",
+ "version": "0.2.3",
"publisher": "llvm-vs-code-extensions",
"homepage": "https://lldb.llvm.org",
"description": "LLDB debugging from VSCode",
@@ -353,7 +353,7 @@
"number",
"string"
],
- "description": "TCP/IP port to attach to. Specifying both pid and port is an error."
+ "description": "TCP/IP port to attach to a remote system. Specifying both pid and port is an error."
},
"gdb-remote-hostname": {
"type": "string",
diff --git a/lldb/tools/lldb-server/LLDBServerUtilities.cpp b/lldb/tools/lldb-server/LLDBServerUtilities.cpp
index c3a8df1..5facfbf 100644
--- a/lldb/tools/lldb-server/LLDBServerUtilities.cpp
+++ b/lldb/tools/lldb-server/LLDBServerUtilities.cpp
@@ -27,11 +27,13 @@ public:
: m_stream_sp(stream_sp) {}
void Emit(llvm::StringRef message) override {
+ std::lock_guard<std::mutex> guard(m_mutex);
(*m_stream_sp) << message;
m_stream_sp->flush();
}
private:
+ std::mutex m_mutex;
std::shared_ptr<raw_ostream> m_stream_sp;
};
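A standalone sketch of the pattern this fix applies: concurrent callers of Emit serialize on a mutex owned by the handler before touching the shared stream.

    #include <iostream>
    #include <mutex>
    #include <string>
    #include <thread>

    class LockedStreamLogger {
    public:
      explicit LockedStreamLogger(std::ostream &os) : m_os(os) {}
      void Emit(const std::string &message) {
        std::lock_guard<std::mutex> guard(m_mutex); // one writer at a time
        m_os << message;
        m_os.flush();
      }

    private:
      std::mutex m_mutex;
      std::ostream &m_os;
    };

    int main() {
      LockedStreamLogger logger(std::cerr);
      std::thread a([&] { logger.Emit("log line from thread A\n"); });
      std::thread b([&] { logger.Emit("log line from thread B\n"); });
      a.join();
      b.join();
    }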
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 1261896..699de1c 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1423,3 +1423,16 @@ endif()
if (LLVM_INCLUDE_UTILS AND LLVM_INCLUDE_TOOLS)
add_subdirectory(utils/llvm-locstats)
endif()
+
+if (XCODE)
+  # For additional targets that you would like to add schemes to, specify e.g.:
+ #
+ # -DLLVM_XCODE_EXTRA_TARGET_SCHEMES="TargetParserTests;SupportTests"
+ #
+ # at CMake configure time.
+ set(LLVM_XCODE_EXTRA_TARGET_SCHEMES "" CACHE STRING "Specifies an extra list of targets to turn into schemes")
+
+ foreach(target ${LLVM_XCODE_EXTRA_TARGET_SCHEMES})
+ set_target_properties(${target} PROPERTIES XCODE_GENERATE_SCHEME ON)
+ endforeach()
+endif()
diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT
index d1620d1c..5b4df55 100644
--- a/llvm/CODE_OWNERS.TXT
+++ b/llvm/CODE_OWNERS.TXT
@@ -263,3 +263,7 @@ D: C-SKY backend (lib/Target/CSKY/*)
N: Ilia Diachkov
E: ilia.diachkov@gmail.com
D: SPIR-V backend (lib/Target/SPIRV/*)
+
+N: Christopher Apple, David Trevelyan
+E: cja-private@pm.me, david.trevelyan@gmail.com
+D: RealtimeSanitizer (LLVM part)
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 03f4e1f..bb4e996 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -327,8 +327,8 @@ function(add_link_opts target_name)
elseif(${CMAKE_SYSTEM_NAME} MATCHES "SunOS" AND LLVM_LINKER_IS_SOLARISLD)
# Support for ld -z discard-unused=sections was only added in
# Solaris 11.4. GNU ld ignores it, but warns every time.
- include(LLVMCheckLinkerFlag)
- llvm_check_linker_flag(CXX "-Wl,-z,discard-unused=sections" LINKER_SUPPORTS_Z_DISCARD_UNUSED)
+ include(CheckLinkerFlag)
+ check_linker_flag(CXX "-Wl,-z,discard-unused=sections" LINKER_SUPPORTS_Z_DISCARD_UNUSED)
if (LINKER_SUPPORTS_Z_DISCARD_UNUSED)
set_property(TARGET ${target_name} APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,-z,discard-unused=sections")
@@ -1452,6 +1452,7 @@ macro(llvm_add_tool project name)
endif()
get_subproject_title(subproject_title)
set_target_properties(${name} PROPERTIES FOLDER "${subproject_title}/Tools")
+ set_target_properties(${name} PROPERTIES XCODE_GENERATE_SCHEME ON)
endmacro(llvm_add_tool project name)
macro(add_llvm_tool name)
@@ -2043,6 +2044,7 @@ function(add_lit_target target comment)
# Tests should be excluded from "Build Solution".
set_target_properties(${target} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD ON)
+ set_target_properties(${target} PROPERTIES XCODE_GENERATE_SCHEME ON)
endfunction()
# Convert a target name like check-clang to a variable name like CLANG.
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 5ca580fb..bdbd361 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -1061,8 +1061,8 @@ if (LLVM_USE_SPLIT_DWARF AND
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR
CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_compile_options($<$<COMPILE_LANGUAGE:C,CXX>:-gsplit-dwarf>)
- include(LLVMCheckLinkerFlag)
- llvm_check_linker_flag(CXX "-Wl,--gdb-index" LINKER_SUPPORTS_GDB_INDEX)
+ include(CheckLinkerFlag)
+ check_linker_flag(CXX "-Wl,--gdb-index" LINKER_SUPPORTS_GDB_INDEX)
append_if(LINKER_SUPPORTS_GDB_INDEX "-Wl,--gdb-index"
CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
endif()
@@ -1083,8 +1083,8 @@ endif()
# lld doesn't print colored diagnostics when invoked from Ninja
if (UNIX AND CMAKE_GENERATOR MATCHES "Ninja")
- include(LLVMCheckLinkerFlag)
- llvm_check_linker_flag(CXX "-Wl,--color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS)
+ include(CheckLinkerFlag)
+ check_linker_flag(CXX "-Wl,--color-diagnostics" LINKER_SUPPORTS_COLOR_DIAGNOSTICS)
append_if(LINKER_SUPPORTS_COLOR_DIAGNOSTICS "-Wl,--color-diagnostics"
CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
endif()
diff --git a/llvm/cmake/modules/HandleLLVMStdlib.cmake b/llvm/cmake/modules/HandleLLVMStdlib.cmake
index 7afc10c..a7e138a 100644
--- a/llvm/cmake/modules/HandleLLVMStdlib.cmake
+++ b/llvm/cmake/modules/HandleLLVMStdlib.cmake
@@ -13,12 +13,12 @@ if(NOT DEFINED LLVM_STDLIB_HANDLED)
endfunction()
include(CheckCXXCompilerFlag)
- include(LLVMCheckLinkerFlag)
+ include(CheckLinkerFlag)
set(LLVM_LIBCXX_USED 0)
if(LLVM_ENABLE_LIBCXX)
if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
check_cxx_compiler_flag("-stdlib=libc++" CXX_COMPILER_SUPPORTS_STDLIB)
- llvm_check_linker_flag(CXX "-stdlib=libc++" CXX_LINKER_SUPPORTS_STDLIB)
+ check_linker_flag(CXX "-stdlib=libc++" CXX_LINKER_SUPPORTS_STDLIB)
if(CXX_COMPILER_SUPPORTS_STDLIB AND CXX_LINKER_SUPPORTS_STDLIB)
append("-stdlib=libc++"
CMAKE_CXX_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS
@@ -36,7 +36,7 @@ if(NOT DEFINED LLVM_STDLIB_HANDLED)
if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
check_cxx_compiler_flag("-static-libstdc++"
CXX_COMPILER_SUPPORTS_STATIC_STDLIB)
- llvm_check_linker_flag(CXX "-static-libstdc++" CXX_LINKER_SUPPORTS_STATIC_STDLIB)
+ check_linker_flag(CXX "-static-libstdc++" CXX_LINKER_SUPPORTS_STATIC_STDLIB)
if(CXX_COMPILER_SUPPORTS_STATIC_STDLIB AND
CXX_LINKER_SUPPORTS_STATIC_STDLIB)
append("-static-libstdc++"
diff --git a/llvm/cmake/modules/LLVMCheckLinkerFlag.cmake b/llvm/cmake/modules/LLVMCheckLinkerFlag.cmake
deleted file mode 100644
index e09bbc6..0000000
--- a/llvm/cmake/modules/LLVMCheckLinkerFlag.cmake
+++ /dev/null
@@ -1,28 +0,0 @@
-include(CheckLinkerFlag OPTIONAL)
-
-if (COMMAND check_linker_flag)
- macro(llvm_check_linker_flag)
- check_linker_flag(${ARGN})
- endmacro()
-else()
- # Until the minimum CMAKE version is 3.18
-
- include(CheckCXXCompilerFlag)
-
- # cmake builtin compatible, except we assume lang is C or CXX
- function(llvm_check_linker_flag lang flag out_var)
- cmake_policy(PUSH)
- cmake_policy(SET CMP0056 NEW)
- set(_CMAKE_EXE_LINKER_FLAGS_SAVE ${CMAKE_EXE_LINKER_FLAGS})
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}")
- if("${lang}" STREQUAL "C")
- check_c_compiler_flag("" ${out_var})
- elseif("${lang}" STREQUAL "CXX")
- check_cxx_compiler_flag("" ${out_var})
- else()
- message(FATAL_ERROR "\"${lang}\" is not C or CXX")
- endif()
- set(CMAKE_EXE_LINKER_FLAGS ${_CMAKE_EXE_LINKER_FLAGS_SAVE})
- cmake_policy(POP)
- endfunction()
-endif()
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 8ccb025..7f20a76 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -303,6 +303,15 @@ them.
Shift LMA of non-zero-sized segments by ``<val>``.
+.. option:: --change-section-address <section>{=+-}<val>, --adjust-section-vma
+
+ Change the address of sections that match ``<section>`` pattern to the
+ specified value, or apply ``+<val>``/``-<val>`` to the current value. Can be
+ specified multiple times to specify multiple patterns. Each section is only
+ modified by one ``--change-section-address`` argument. If a section name
+ matches multiple patterns, the rightmost change applies. The object file needs
+ to be of ET_REL type.
+
.. option:: --change-start <incr>, --adjust-start
Add ``<incr>`` to the program's start address. Can be specified multiple
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index 5bbe902..aef88bc 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -96,7 +96,7 @@ Buffers
.. code-block:: llvm
- target("dx.TypedBuffer", ElementType, IsWriteable, IsROV)
+ target("dx.TypedBuffer", ElementType, IsWriteable, IsROV, IsSigned)
target("dx.RawBuffer", ElementType, IsWriteable, IsROV)
We need two separate buffer types to account for the differences between the
@@ -106,9 +106,14 @@ used for DXIL's RawBuffers and StructuredBuffers. We call the latter
"RawBuffer" to match the naming of the operations, but it can represent both
the Raw and Structured variants.
-For TypedBuffer, the element type must be an integer or floating point type.
-For RawBuffer the type can be an integer, floating point, or struct type.
-HLSL's ByteAddressBuffer is represented by an `i8` element type.
+HLSL's Buffer and RWBuffer are represented as a TypedBuffer with an element
+type that is a scalar integer or floating point type, or a vector of at most 4
+such types. HLSL's ByteAddressBuffer is a RawBuffer with an `i8` element type.
+HLSL's StructuredBuffers are RawBuffers with a struct, vector, or scalar element type.
+
+One unfortunate necessity here is that TypedBuffer needs an extra parameter to
+differentiate signed vs unsigned ints. This is because in LLVM IR int types
+don't have a sign, so to keep this information we need a side channel.
These types are generally used by BufferLoad and BufferStore operations, as
well as atomics.
@@ -128,6 +133,8 @@ There are a few fields to describe variants of all of these types:
writeable) and UAVs (writeable).
* - IsROV
- Whether the UAV is a rasterizer ordered view. Always ``0`` for SRVs.
+ * - IsSigned
+ - Whether an int element type is signed ("dx.TypedBuffer" only)
.. _bufferLoad: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferload
.. _bufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore
@@ -197,23 +204,23 @@ Examples:
.. code-block:: llvm
; RWBuffer<float4> Buf : register(u5, space3)
- %buf = call target("dx.TypedBuffer", float, 1, 0)
+ %buf = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0(
i32 3, i32 5, i32 1, i32 0, i1 false)
- ; RWBuffer<uint> Buf : register(u7, space2)
- %buf = call target("dx.TypedBuffer", i32, 1, 0)
+ ; RWBuffer<int> Buf : register(u7, space2)
+ %buf = call target("dx.TypedBuffer", i32, 1, 0, 1)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0t(
i32 2, i32 7, i32 1, i32 0, i1 false)
; Buffer<uint4> Buf[24] : register(t3, space5)
- %buf = call target("dx.TypedBuffer", i32, 0, 0)
+ %buf = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0)
@llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_0_0t(
i32 2, i32 7, i32 24, i32 0, i1 false)
; struct S { float4 a; uint4 b; };
; StructuredBuffer<S> Buf : register(t2, space4)
- %buf = call target("dx.RawBuffer", {<4 x f32>, <4 x i32>}, 0, 0)
+ %buf = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0)
@llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t(
i32 4, i32 2, i32 1, i32 0, i1 false)
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 0a1913d..e03ae5e 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -291,10 +291,11 @@ uses the package and provides other details.
=========================================================== ============ ==========================================
Package Version Notes
=========================================================== ============ ==========================================
-`CMake <http://cmake.org/>`__ >=3.20.0 Makefile/workspace generator
+`CMake <http://cmake.org/>`_ >=3.20.0 Makefile/workspace generator
`python <http://www.python.org/>`_ >=3.8 Automated test suite\ :sup:`1`
`zlib <http://zlib.net>`_ >=1.2.3.4 Compression library\ :sup:`2`
`GNU Make <http://savannah.gnu.org/projects/make>`_ 3.79, 3.79.1 Makefile/build processor\ :sup:`3`
+`PyYAML <https://pypi.org/project/PyYAML/>`_ >=5.1 Header generator\ :sup:`4`
=========================================================== ============ ==========================================
.. note::
@@ -305,6 +306,7 @@ Package Version Notes
#. Optional, adds compression / uncompression capabilities to selected LLVM
tools.
#. Optional, you can use any other build tool supported by CMake.
+   #. Only needed when building libc with New Headergen; used mainly by libc.
Additionally, your compilation host is expected to have the usual plethora of
Unix utilities. Specifically:
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst
index e00165c..231de56 100644
--- a/llvm/docs/ProgrammersManual.rst
+++ b/llvm/docs/ProgrammersManual.rst
@@ -1392,6 +1392,7 @@ How to use reduce-chunk-list:
First, figure out the number of calls to the debug counter you want to minimize.
To do so, run the compilation command you want to minimize with `-print-debug-counter`, adding `-mllvm` if needed.
Then find the line with the counter of interest. It should look like:
+
.. code-block:: none
my-counter : {5678,empty}
@@ -1400,6 +1401,7 @@ The number of calls to `my-counter` is 5678
Then find the minimum set of chunks that is interesting, with `reduce-chunk-list`.
Build a reproducer script like:
+
.. code-block:: bash
#! /bin/bash
diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst
index 0acc929..c1a95ef 100644
--- a/llvm/docs/SourceLevelDebugging.rst
+++ b/llvm/docs/SourceLevelDebugging.rst
@@ -389,12 +389,12 @@ Compiled to LLVM, this function would be represented like this:
%X = alloca i32, align 4
%Y = alloca i32, align 4
%Z = alloca i32, align 4
- #dbg_declare(ptr %X, !11, !DIExpression(), !14)
- store i32 21, i32* %X, align 4, !dbg !14
- #dbg_declare(ptr %Y, !15, !DIExpression(), !16)
- store i32 22, i32* %Y, align 4, !dbg !16
- #dbg_declare(ptr %Z, !17, !DIExpression(), !19)
- store i32 23, i32* %Z, align 4, !dbg !19
+ #dbg_declare(ptr %X, !11, !DIExpression(), !13)
+ store i32 21, i32* %X, align 4, !dbg !13
+ #dbg_declare(ptr %Y, !14, !DIExpression(), !15)
+ store i32 22, i32* %Y, align 4, !dbg !15
+ #dbg_declare(ptr %Z, !16, !DIExpression(), !18)
+ store i32 23, i32* %Z, align 4, !dbg !18
%0 = load i32, i32* %X, align 4, !dbg !20
store i32 %0, i32* %Z, align 4, !dbg !21
%1 = load i32, i32* %Y, align 4, !dbg !22
@@ -427,9 +427,9 @@ Compiled to LLVM, this function would be represented like this:
!15 = !DILocation(line: 3, column: 9, scope: !4)
!16 = !DILocalVariable(name: "Z", scope: !18, file: !1, line: 5, type: !12)
!17 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
- !18 = !DILocation(line: 5, column: 11, scope: !18)
- !29 = !DILocation(line: 6, column: 11, scope: !18)
- !20 = !DILocation(line: 6, column: 9, scope: !18)
+ !18 = !DILocation(line: 5, column: 11, scope: !17)
+ !29 = !DILocation(line: 6, column: 11, scope: !17)
+ !20 = !DILocation(line: 6, column: 9, scope: !17)
!21 = !DILocation(line: 8, column: 9, scope: !4)
!22 = !DILocation(line: 8, column: 7, scope: !4)
!23 = !DILocation(line: 9, column: 3, scope: !4)
@@ -443,21 +443,21 @@ variable definitions, and the code used to implement the function.
.. code-block:: llvm
- #dbg_declare(ptr %X, !11, !DIExpression(), !14)
- ; [debug line = 2:7] [debug variable = X]
+ #dbg_declare(ptr %X, !11, !DIExpression(), !13)
+ ; [debug line = 2:9] [debug variable = X]
The first record ``#dbg_declare`` encodes debugging information for the
-variable ``X``. The location ``!14`` at the end of the record provides
+variable ``X``. The location ``!13`` at the end of the record provides
scope information for the variable ``X``.
.. code-block:: text
- !14 = !DILocation(line: 2, column: 9, scope: !4)
+ !13 = !DILocation(line: 2, column: 9, scope: !4)
!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !5,
isLocal: false, isDefinition: true, scopeLine: 1,
isOptimized: false, retainedNodes: !2)
-Here ``!14`` is metadata providing `location information
+Here ``!13`` is metadata providing `location information
<LangRef.html#dilocation>`_. In this example, scope is encoded by ``!4``, a
`subprogram descriptor <LangRef.html#disubprogram>`_. This way the location
information parameter to the records indicates that the variable ``X`` is
@@ -467,20 +467,20 @@ Now lets take another example.
.. code-block:: llvm
- #dbg_declare(ptr %Z, !17, !DIExpression(), !19)
- ; [debug line = 5:9] [debug variable = Z]
+ #dbg_declare(ptr %Z, !16, !DIExpression(), !18)
+ ; [debug line = 5:11] [debug variable = Z]
The third record ``#dbg_declare`` encodes debugging information for
-variable ``Z``. The metadata ``!19`` at the end of the record provides
+variable ``Z``. The metadata ``!18`` at the end of the record provides
scope information for the variable ``Z``.
.. code-block:: text
- !18 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
- !19 = !DILocation(line: 5, column: 11, scope: !18)
+ !17 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
+ !18 = !DILocation(line: 5, column: 11, scope: !17)
-Here ``!19`` indicates that ``Z`` is declared at line number 5 and column
-number 11 inside of lexical scope ``!18``. The lexical scope itself resides
+Here ``!18`` indicates that ``Z`` is declared at line number 5 and column
+number 11 inside of lexical scope ``!17``. The lexical scope itself resides
inside of subprogram ``!4`` described above.
The scope information attached with each instruction provides a straightforward
diff --git a/llvm/include/llvm-c/Error.h b/llvm/include/llvm-c/Error.h
index c3baaf6..874bbcf 100644
--- a/llvm/include/llvm-c/Error.h
+++ b/llvm/include/llvm-c/Error.h
@@ -52,6 +52,14 @@ LLVMErrorTypeId LLVMGetErrorTypeId(LLVMErrorRef Err);
void LLVMConsumeError(LLVMErrorRef Err);
/**
+ * Report a fatal error if Err is a failure value.
+ *
+ * This function can be used to wrap calls to fallible functions ONLY when it is
+ * known that the Error will always be a success value.
+ */
+void LLVMCantFail(LLVMErrorRef Err);
+
+/**
* Returns the given string's error message. This operation consumes the error,
* and the given LLVMErrorRef value is not usable once this call returns.
* The caller is responsible for disposing of the string by calling
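A standalone model of the contract the new LLVMCantFail documents: if the error is a failure value, report it and abort; on a success value it is a no-op. `FakeErrorRef` is a hypothetical stand-in for `LLVMErrorRef` so the sketch compiles without LLVM headers:

    #include <cstdio>
    #include <cstdlib>

    using FakeErrorRef = const char *; // nullptr == success, otherwise a message

    // Only safe to use when the call is known to succeed; a failure is fatal.
    static void cantFail(FakeErrorRef err) {
      if (err) {
        std::fprintf(stderr, "fatal error: %s\n", err);
        std::abort();
      }
    }

    int main() {
      cantFail(nullptr);            // success value: no-op
      // cantFail("disk exploded"); // would terminate the process
    }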
diff --git a/llvm/include/llvm-c/Target.h b/llvm/include/llvm-c/Target.h
index 518b46d..aef06a5 100644
--- a/llvm/include/llvm-c/Target.h
+++ b/llvm/include/llvm-c/Target.h
@@ -244,7 +244,7 @@ LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD);
LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD,
unsigned AS);
-/** Computes the size of a type in bytes for a target.
+/** Computes the size of a type in bits for a target.
See the method llvm::DataLayout::getTypeSizeInBits. */
unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty);
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index bff8e64..7039e96 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -188,6 +188,9 @@ struct APFloatBase {
// This format's exponent bias is 11, instead of the 7 (2 ** (4 - 1) - 1)
// that IEEE precedent would imply.
S_Float8E4M3B11FNUZ,
+ // 8-bit floating point number following IEEE-754 conventions with bit
+ // layout S1E3M4.
+ S_Float8E3M4,
// Floating point number that occupies 32 bits or less of storage, providing
// improved range compared to half (16-bit) formats, at (potentially)
// greater throughput than single precision (32-bit) formats.
@@ -224,6 +227,7 @@ struct APFloatBase {
static const fltSemantics &Float8E4M3FN() LLVM_READNONE;
static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
+ static const fltSemantics &Float8E3M4() LLVM_READNONE;
static const fltSemantics &FloatTF32() LLVM_READNONE;
static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
@@ -646,6 +650,7 @@ private:
APInt convertFloat8E4M3FNAPFloatToAPInt() const;
APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
+ APInt convertFloat8E3M4APFloatToAPInt() const;
APInt convertFloatTF32APFloatToAPInt() const;
APInt convertFloat6E3M2FNAPFloatToAPInt() const;
APInt convertFloat6E2M3FNAPFloatToAPInt() const;
@@ -665,6 +670,7 @@ private:
void initFromFloat8E4M3FNAPInt(const APInt &api);
void initFromFloat8E4M3FNUZAPInt(const APInt &api);
void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
+ void initFromFloat8E3M4APInt(const APInt &api);
void initFromFloatTF32APInt(const APInt &api);
void initFromFloat6E3M2FNAPInt(const APInt &api);
void initFromFloat6E2M3FNAPInt(const APInt &api);
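A standalone sketch decoding the new S1E3M4 layout, assuming the usual IEEE-754 conventions the comment refers to: exponent bias 3, all-ones exponent reserved for Inf/NaN, and subnormals when the exponent field is 0.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static double decodeFloat8E3M4(uint8_t bits) {
      const int sign = (bits >> 7) & 0x1;
      const int exp = (bits >> 4) & 0x7; // 3 exponent bits
      const int man = bits & 0xF;        // 4 mantissa bits
      const double s = sign ? -1.0 : 1.0;
      if (exp == 0x7)                    // reserved encodings: Inf / NaN
        return man == 0 ? s * INFINITY : NAN;
      if (exp == 0)                      // subnormal: no implicit leading 1
        return s * (man / 16.0) * std::ldexp(1.0, 1 - 3);
      return s * (1.0 + man / 16.0) * std::ldexp(1.0, exp - 3);
    }

    int main() {
      std::printf("%g\n", decodeFloat8E3M4(0x30)); // 1.0  (exp=3, man=0)
      std::printf("%g\n", decodeFloat8E3M4(0x6F)); // 15.5 (largest finite value)
    }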
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 67d474d..f71cd5b 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -118,12 +118,13 @@ public:
return;
}
- const KeyT EmptyKey = getEmptyKey(), TombstoneKey = getTombstoneKey();
+ const KeyT EmptyKey = getEmptyKey();
if (std::is_trivially_destructible<ValueT>::value) {
// Use a simpler loop when values don't need destruction.
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P)
P->getFirst() = EmptyKey;
} else {
+ const KeyT TombstoneKey = getTombstoneKey();
unsigned NumEntries = getNumEntries();
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) {
if (!KeyInfoT::isEqual(P->getFirst(), EmptyKey)) {
diff --git a/llvm/include/llvm/ADT/GenericCycleImpl.h b/llvm/include/llvm/ADT/GenericCycleImpl.h
index ab9c421..56d5ba6 100644
--- a/llvm/include/llvm/ADT/GenericCycleImpl.h
+++ b/llvm/include/llvm/ADT/GenericCycleImpl.h
@@ -134,6 +134,8 @@ template <typename ContextT> class GenericCycleInfoCompute {
DFSInfo() = default;
explicit DFSInfo(unsigned Start) : Start(Start) {}
+ explicit operator bool() const { return Start; }
+
/// Whether this node is an ancestor (or equal to) the node \p Other
/// in the DFS tree.
bool isAncestorOf(const DFSInfo &Other) const {
@@ -231,6 +233,8 @@ void GenericCycleInfoCompute<ContextT>::run(BlockT *EntryBlock) {
for (BlockT *Pred : predecessors(HeaderCandidate)) {
const DFSInfo PredDFSInfo = BlockDFSInfo.lookup(Pred);
+ // This automatically ignores unreachable predecessors since they have
+ // zeros in their DFSInfo.
if (CandidateInfo.isAncestorOf(PredDFSInfo))
Worklist.push_back(Pred);
}
@@ -257,6 +261,10 @@ void GenericCycleInfoCompute<ContextT>::run(BlockT *EntryBlock) {
const DFSInfo PredDFSInfo = BlockDFSInfo.lookup(Pred);
if (CandidateInfo.isAncestorOf(PredDFSInfo)) {
Worklist.push_back(Pred);
+ } else if (!PredDFSInfo) {
+        // Ignore an unreachable predecessor. It would otherwise incorrectly cause
+ // Block to be treated as a cycle entry.
+ LLVM_DEBUG(errs() << " skipped unreachable predecessor.\n");
} else {
IsEntry = true;
}
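The new `explicit operator bool()` leans on DFS start indices beginning at 1, so a default-constructed DFSInfo (Start == 0) marks a block the DFS never reached. A standalone sketch of that sentinel check; `isAncestorOf` here is a simplified interval test, not the exact LLVM implementation:

    #include <iostream>

    struct DFSInfo {
      unsigned Start = 0; // 0 == unvisited, i.e. unreachable from the entry
      unsigned End = 0;
      explicit operator bool() const { return Start != 0; }
      bool isAncestorOf(const DFSInfo &Other) const {
        return Start <= Other.Start && Other.End <= End;
      }
    };

    int main() {
      DFSInfo header{1, 10};
      DFSInfo unreachable{}; // never numbered by the DFS
      if (!unreachable)
        std::cout << "skipped unreachable predecessor\n";
      std::cout << header.isAncestorOf(DFSInfo{3, 4}) << "\n"; // 1
    }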
diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h
index 884b575..f675f82 100644
--- a/llvm/include/llvm/ADT/StableHashing.h
+++ b/llvm/include/llvm/ADT/StableHashing.h
@@ -95,18 +95,6 @@ inline stable_hash stable_hash_combine_array(const stable_hash *P, size_t C) {
hashing::detail::stable_hash_append(Hash, P[I]);
return Hash;
}
-
-inline stable_hash stable_hash_combine_string(const StringRef &S) {
- return stable_hash_combine_range(S.begin(), S.end());
-}
-
-inline stable_hash stable_hash_combine_string(const char *C) {
- stable_hash Hash = hashing::detail::FNV_OFFSET_64;
- while (*C)
- hashing::detail::stable_hash_append(Hash, *(C++));
- return Hash;
-}
-
} // namespace llvm
#endif
diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index d4006ae..ed9fade 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -47,10 +47,14 @@ class ResourceInfo {
struct StructInfo {
uint32_t Stride;
- Align Alignment;
+ // Note: we store an integer here rather than using `MaybeAlign` because in
+ // GCC 7 MaybeAlign isn't trivial so having one in this union would delete
+ // our move constructor.
+ // See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0602r4.html
+ uint32_t AlignLog2;
bool operator==(const StructInfo &RHS) const {
- return std::tie(Stride, Alignment) == std::tie(RHS.Stride, RHS.Alignment);
+ return std::tie(Stride, AlignLog2) == std::tie(RHS.Stride, RHS.AlignLog2);
}
bool operator!=(const StructInfo &RHS) const { return !(*this == RHS); }
};
@@ -106,6 +110,11 @@ class ResourceInfo {
MSInfo MultiSample;
+public:
+ ResourceInfo(dxil::ResourceClass RC, dxil::ResourceKind Kind, Value *Symbol,
+ StringRef Name)
+ : Symbol(Symbol), Name(Name), RC(RC), Kind(Kind) {}
+
// Conditions to check before accessing union members.
bool isUAV() const;
bool isCBuffer() const;
@@ -115,17 +124,51 @@ class ResourceInfo {
bool isFeedback() const;
bool isMultiSample() const;
- ResourceInfo(dxil::ResourceClass RC, dxil::ResourceKind Kind, Value *Symbol,
- StringRef Name)
- : Symbol(Symbol), Name(Name), RC(RC), Kind(Kind) {}
+ void bind(uint32_t UniqueID, uint32_t Space, uint32_t LowerBound,
+ uint32_t Size) {
+ Binding.UniqueID = UniqueID;
+ Binding.Space = Space;
+ Binding.LowerBound = LowerBound;
+ Binding.Size = Size;
+ }
+ void setUAV(bool GloballyCoherent, bool HasCounter, bool IsROV) {
+ assert(isUAV() && "Not a UAV");
+ UAVFlags.GloballyCoherent = GloballyCoherent;
+ UAVFlags.HasCounter = HasCounter;
+ UAVFlags.IsROV = IsROV;
+ }
+ void setCBuffer(uint32_t Size) {
+ assert(isCBuffer() && "Not a CBuffer");
+ CBufferSize = Size;
+ }
+ void setSampler(dxil::SamplerType Ty) { SamplerTy = Ty; }
+ void setStruct(uint32_t Stride, MaybeAlign Alignment) {
+ assert(isStruct() && "Not a Struct");
+ Struct.Stride = Stride;
+ Struct.AlignLog2 = Alignment ? Log2(*Alignment) : 0;
+ }
+ void setTyped(dxil::ElementType ElementTy, uint32_t ElementCount) {
+ assert(isTyped() && "Not Typed");
+ Typed.ElementTy = ElementTy;
+ Typed.ElementCount = ElementCount;
+ }
+ void setFeedback(dxil::SamplerFeedbackType Type) {
+ assert(isFeedback() && "Not Feedback");
+ Feedback.Type = Type;
+ }
+ void setMultiSample(uint32_t Count) {
+ assert(isMultiSample() && "Not MultiSampled");
+ MultiSample.Count = Count;
+ }
+
+ bool operator==(const ResourceInfo &RHS) const;
-public:
static ResourceInfo SRV(Value *Symbol, StringRef Name,
dxil::ElementType ElementTy, uint32_t ElementCount,
dxil::ResourceKind Kind);
static ResourceInfo RawBuffer(Value *Symbol, StringRef Name);
static ResourceInfo StructuredBuffer(Value *Symbol, StringRef Name,
- uint32_t Stride, Align Alignment);
+ uint32_t Stride, MaybeAlign Alignment);
static ResourceInfo Texture2DMS(Value *Symbol, StringRef Name,
dxil::ElementType ElementTy,
uint32_t ElementCount, uint32_t SampleCount);
@@ -141,9 +184,9 @@ public:
static ResourceInfo RWRawBuffer(Value *Symbol, StringRef Name,
bool GloballyCoherent, bool IsROV);
static ResourceInfo RWStructuredBuffer(Value *Symbol, StringRef Name,
- uint32_t Stride,
- Align Alignment, bool GloballyCoherent,
- bool IsROV, bool HasCounter);
+ uint32_t Stride, MaybeAlign Alignment,
+ bool GloballyCoherent, bool IsROV,
+ bool HasCounter);
static ResourceInfo RWTexture2DMS(Value *Symbol, StringRef Name,
dxil::ElementType ElementTy,
uint32_t ElementCount, uint32_t SampleCount,
@@ -164,16 +207,6 @@ public:
static ResourceInfo Sampler(Value *Symbol, StringRef Name,
dxil::SamplerType SamplerTy);
- void bind(uint32_t UniqueID, uint32_t Space, uint32_t LowerBound,
- uint32_t Size) {
- Binding.UniqueID = UniqueID;
- Binding.Space = Space;
- Binding.LowerBound = LowerBound;
- Binding.Size = Size;
- }
-
- bool operator==(const ResourceInfo &RHS) const;
-
MDTuple *getAsMetadata(LLVMContext &Ctx) const;
ResourceBinding getBinding() const { return Binding; }
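A standalone sketch of the AlignLog2 trick the new comment explains: keep a trivially copyable integer (the log2 of a power-of-two alignment) in the union instead of a non-trivial MaybeAlign. As in the patch, 0 doubles as the "no alignment recorded" encoding; the sketch assumes power-of-two alignments.

    #include <cstdint>
    #include <iostream>

    static uint32_t encodeAlignLog2(uint64_t align_bytes) {
      if (align_bytes == 0)
        return 0; // mirrors the empty-MaybeAlign case in the patch
      uint32_t log2 = 0;
      while ((uint64_t(1) << (log2 + 1)) <= align_bytes)
        ++log2;
      return log2;
    }

    static uint64_t decodeAlign(uint32_t align_log2) {
      return uint64_t(1) << align_log2;
    }

    int main() {
      std::cout << encodeAlignLog2(8) << "\n"; // 3
      std::cout << decodeAlign(3) << "\n";     // 8
    }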
diff --git a/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h b/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h
index 84ed882..ca4ce68 100644
--- a/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h
+++ b/llvm/include/llvm/Analysis/GenericDomTreeUpdater.h
@@ -232,7 +232,7 @@ protected:
/// insertEdge/deleteEdge or is unnecessary in the batch update.
bool isUpdateValid(typename DomTreeT::UpdateType Update) const;
- /// Erase Basic Block node that has been unlinked from Function
+ /// Erase Basic Block node before it is unlinked from Function
/// in the DomTree and PostDomTree.
void eraseDelBBNode(BasicBlockT *DelBB);
diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 33e8178..1f01ff7 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -69,8 +69,7 @@ bool isDereferenceableAndAlignedPointer(const Value *V, Align Alignment,
/// quick local scan of the basic block containing ScanFrom, to determine if
/// the address is already accessed.
bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size,
- const DataLayout &DL,
- Instruction *ScanFrom = nullptr,
+ const DataLayout &DL, Instruction *ScanFrom,
AssumptionCache *AC = nullptr,
const DominatorTree *DT = nullptr,
const TargetLibraryInfo *TLI = nullptr);
@@ -100,12 +99,18 @@ bool isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE,
/// quick local scan of the basic block containing ScanFrom, to determine if
/// the address is already accessed.
bool isSafeToLoadUnconditionally(Value *V, Type *Ty, Align Alignment,
- const DataLayout &DL,
- Instruction *ScanFrom = nullptr,
+ const DataLayout &DL, Instruction *ScanFrom,
AssumptionCache *AC = nullptr,
const DominatorTree *DT = nullptr,
const TargetLibraryInfo *TLI = nullptr);
+/// Return true if speculation of the given load must be suppressed to avoid
+/// ordering or interfering with an active sanitizer. If not suppressed,
+/// dereferenceability and alignment must be proven separately. Note: This
+/// is only needed for raw reasoning; if you use the interface below
+/// (isSafeToSpeculativelyExecute), this is handled internally.
+bool mustSuppressSpeculation(const LoadInst &LI);
+
/// The default number of maximum instructions to scan in the block, used by
/// FindAvailableLoadedValue().
extern cl::opt<unsigned> DefMaxInstsToScan;
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index d9bfca7..fbefa2b 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1028,6 +1028,9 @@ public:
/// Test if the given expression is known to be non-zero.
bool isKnownNonZero(const SCEV *S);
+ /// Test if the given expression is known to be a power of 2.
+ bool isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero = false);
+
/// Splits SCEV expression \p S into two SCEVs. One of them is obtained from
/// \p S by substitution of all AddRec sub-expression related to loop \p L
/// with initial value of that SCEV. The second is obtained from \p S by
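A standalone sketch of the predicate the new isKnownToBeAPowerOfTwo hook exposes, using the classic bit trick; the OrZero flag mirrors the declared parameter and decides whether 0 is accepted:

    #include <cstdint>
    #include <iostream>

    static bool isPowerOfTwo(uint64_t value, bool or_zero = false) {
      if (value == 0)
        return or_zero;                  // 0 only counts when the caller allows it
      return (value & (value - 1)) == 0; // exactly one bit set
    }

    int main() {
      std::cout << isPowerOfTwo(64) << "\n";      // 1
      std::cout << isPowerOfTwo(0) << "\n";       // 0
      std::cout << isPowerOfTwo(0, true) << "\n"; // 1
      std::cout << isPowerOfTwo(12) << "\n";      // 0
    }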
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
index 623cdb4..754f09c 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
@@ -476,6 +476,21 @@ TLI_DEFINE_ENUM_INTERNAL(atexit)
TLI_DEFINE_STRING_INTERNAL("atexit")
TLI_DEFINE_SIG_INTERNAL(Int, Ptr)
+/// void abort(void)
+TLI_DEFINE_ENUM_INTERNAL(abort)
+TLI_DEFINE_STRING_INTERNAL("abort")
+TLI_DEFINE_SIG_INTERNAL(Void)
+
+/// void exit(int)
+TLI_DEFINE_ENUM_INTERNAL(exit)
+TLI_DEFINE_STRING_INTERNAL("exit")
+TLI_DEFINE_SIG_INTERNAL(Void, Int)
+
+/// void _Exit(int)
+TLI_DEFINE_ENUM_INTERNAL(Exit)
+TLI_DEFINE_STRING_INTERNAL("_Exit")
+TLI_DEFINE_SIG_INTERNAL(Void, Int)
+
/// void __cxa_guard_abort(guard_t *guard);
/// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
TLI_DEFINE_ENUM_INTERNAL(cxa_guard_abort)
@@ -1792,6 +1807,21 @@ TLI_DEFINE_ENUM_INTERNAL(modfl)
TLI_DEFINE_STRING_INTERNAL("modfl")
TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl, Ptr)
+/// double nan(const char *arg);
+TLI_DEFINE_ENUM_INTERNAL(nan)
+TLI_DEFINE_STRING_INTERNAL("nan")
+TLI_DEFINE_SIG_INTERNAL(Dbl, Ptr)
+
+/// float nanf(const char *arg);
+TLI_DEFINE_ENUM_INTERNAL(nanf)
+TLI_DEFINE_STRING_INTERNAL("nanf")
+TLI_DEFINE_SIG_INTERNAL(Flt, Ptr)
+
+/// long double nanl(const char *arg);
+TLI_DEFINE_ENUM_INTERNAL(nanl)
+TLI_DEFINE_STRING_INTERNAL("nanl")
+TLI_DEFINE_SIG_INTERNAL(LDbl, Ptr)
+
/// double nearbyint(double x);
TLI_DEFINE_ENUM_INTERNAL(nearbyint)
TLI_DEFINE_STRING_INTERNAL("nearbyint")
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2411b2b..38e8b9d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1587,7 +1587,7 @@ public:
/// \returns The type to use in a loop expansion of a memcpy call.
Type *getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
- unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+ unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicElementSize = std::nullopt) const;
/// \param[out] OpsOut The operand types to copy RemainingBytes of memory.
@@ -1599,7 +1599,7 @@ public:
void getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize = std::nullopt) const;
/// \returns True if the two functions have compatible attributes for inlining
@@ -2133,13 +2133,13 @@ public:
Type *ExpectedType) = 0;
virtual Type *getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
- unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+ unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicElementSize) const = 0;
virtual void getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize) const = 0;
virtual bool areInlineCompatible(const Function *Caller,
const Function *Callee) const = 0;
@@ -2838,7 +2838,7 @@ public:
}
Type *getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
- unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+ unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicElementSize) const override {
return Impl.getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
DestAddrSpace, SrcAlign, DestAlign,
@@ -2847,7 +2847,7 @@ public:
void getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize) const override {
Impl.getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
SrcAddrSpace, DestAddrSpace,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 00efa47..899c504 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -839,7 +839,7 @@ public:
Type *
getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicElementSize) const {
return AtomicElementSize ? Type::getIntNTy(Context, *AtomicElementSize * 8)
: Type::getInt8Ty(Context);
@@ -848,7 +848,7 @@ public:
void getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize) const {
unsigned OpSizeInBytes = AtomicCpySize ? *AtomicCpySize : 1;
Type *OpType = Type::getIntNTy(Context, OpSizeInBytes * 8);
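The unsigned-to-Align migration in the two headers above means callers construct llvm::Align values instead of passing raw byte counts. A sketch, with TTI, Ctx and Length assumed to be in scope:

    llvm::Align SrcAlign(4), DstAlign(4);
    llvm::Type *OpTy = TTI.getMemcpyLoopLoweringType(
        Ctx, Length, /*SrcAddrSpace=*/0, /*DestAddrSpace=*/0, SrcAlign, DstAlign);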
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 5ef6e43..96fa169 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -792,13 +792,6 @@ bool onlyUsedByLifetimeMarkers(const Value *V);
/// droppable instructions.
bool onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V);
-/// Return true if speculation of the given load must be suppressed to avoid
-/// ordering or interfering with an active sanitizer. If not suppressed,
-/// dereferenceability and alignment must be proven separately. Note: This
-/// is only needed for raw reasoning; if you use the interface below
-/// (isSafeToSpeculativelyExecute), this is handled internally.
-bool mustSuppressSpeculation(const LoadInst &LI);
-
/// Return true if the instruction does not have any effects besides
/// calculating the result and does not have undefined behavior.
///
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 444cef6..7ea010a 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -51,13 +51,19 @@ TLI_DEFINE_VECFUNC("llvm.cos.f32", "vcosf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("tanf", "vtanf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("llvm.tan.f32", "vtanf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("asinf", "vasinf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "vasinf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "vacosf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "vatanf", FIXED(4), "_ZGV_LLVM_N4v")
// Hyperbolic Functions
TLI_DEFINE_VECFUNC("sinhf", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("coshf", "vcoshf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "vcoshf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("tanhf", "vtanhf", FIXED(4), "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "vtanhf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("asinhf", "vasinhf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("acoshf", "vacoshf", FIXED(4), "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("atanhf", "vatanhf", FIXED(4), "_ZGV_LLVM_N4v")
@@ -1358,9 +1364,17 @@ TLI_DEFINE_VECFUNC("asinf", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("asinf", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("asinf", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "amd_vrd8_asin", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "amd_vrs16_asinf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("acosf", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("acosf", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs8_acosf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "amd_vrs4_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+
TLI_DEFINE_VECFUNC("atan", "amd_vrd2_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("atan", "amd_vrd4_atan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("atan", "amd_vrd8_atan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
@@ -1368,12 +1382,25 @@ TLI_DEFINE_VECFUNC("atanf", "amd_vrs4_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("atanf", "amd_vrs8_atanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
TLI_DEFINE_VECFUNC("atanf", "amd_vrs16_atanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd2_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd4_atan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "amd_vrd8_atan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs4_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs8_atanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "amd_vrs16_atanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
TLI_DEFINE_VECFUNC("coshf", "amd_vrs4_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("coshf", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "amd_vrs4_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
TLI_DEFINE_VECFUNC("tanhf", "amd_vrs4_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
TLI_DEFINE_VECFUNC("tanhf", "amd_vrs8_tanhf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs4_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs8_tanhf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+
TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index e381295..9e551d3 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -560,8 +560,7 @@ namespace llvm {
bool parseExceptionArgs(SmallVectorImpl<Value *> &Args,
PerFunctionState &PFS);
- bool resolveFunctionType(Type *RetType,
- const SmallVector<ParamInfo, 16> &ArgList,
+ bool resolveFunctionType(Type *RetType, ArrayRef<ParamInfo> ArgList,
FunctionType *&FuncTy);
// Constant Parsing.
diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
index 36a844e..85046c2 100644
--- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h
+++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
@@ -137,6 +137,8 @@ public:
void beginBasicBlockSection(const MachineBasicBlock &MBB);
void endBasicBlockSection(const MachineBasicBlock &MBB);
+ virtual void beginCodeAlignment(const MachineBasicBlock &MBB) {}
+
/// Return Label preceding the instruction.
MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index a905c85..ad1c541 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -490,6 +490,18 @@ m_c_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC) {
CC);
}
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P>
+m_Select(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+ return TernaryOpc_match<T0_P, T1_P, T2_P>(ISD::SELECT, Cond, T, F);
+}
+
+template <typename T0_P, typename T1_P, typename T2_P>
+inline TernaryOpc_match<T0_P, T1_P, T2_P>
+m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+ return TernaryOpc_match<T0_P, T1_P, T2_P>(ISD::VSELECT, Cond, T, F);
+}
+
// === Binary operations ===
template <typename LHS_P, typename RHS_P, bool Commutable = false,
bool ExcludeChain = false>
@@ -722,6 +734,10 @@ inline Or<UnaryOpc_match<Opnd>, Opnd> m_TruncOrSelf(const Opnd &Op) {
return Or<UnaryOpc_match<Opnd>, Opnd>(m_Trunc(Op), Op);
}
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_VScale(const Opnd &Op) {
+ return UnaryOpc_match<Opnd>(ISD::VSCALE, Op);
+}
+
// === Constants ===
struct ConstantInt_match {
APInt *BindVal;
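A hedged example of the new DAG matchers in a DAGCombiner-style check; N is assumed to be an SDValue:

    using namespace llvm::SDPatternMatch;
    llvm::SDValue Cond, TVal, FVal;
    if (sd_match(N, m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))) {
      // N is an ISD::SELECT; Cond/TVal/FVal are now bound to its operands.
    }
    if (sd_match(N, m_VScale(m_Value(TVal)))) {
      // N is an ISD::VSCALE node.
    }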
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9d9886f..9ccdbab 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1640,7 +1640,8 @@ public:
MVT NVT = VT;
do {
NVT = (MVT::SimpleValueType)(NVT.SimpleTy+1);
- assert(NVT.isInteger() == VT.isInteger() && NVT != MVT::isVoid &&
+ assert(NVT.isInteger() == VT.isInteger() &&
+ NVT.isFloatingPoint() == VT.isFloatingPoint() &&
"Didn't find type to promote to!");
} while (VTBits >= NVT.getScalarSizeInBits() || !isTypeLegal(NVT) ||
getOperationAction(Op, NVT) == Promote);
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 963b6a7..0883f59 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -289,34 +289,34 @@ def aarch64svcount
def spirvbuiltin : ValueType<0, 200>; // SPIR-V's builtin type
let isNormalValueType = false in {
-def token : ValueType<0, 248>; // TokenTy
-def MetadataVT : ValueType<0, 249> { // Metadata
+def token : ValueType<0, 504>; // TokenTy
+def MetadataVT : ValueType<0, 505> { // Metadata
let LLVMName = "Metadata";
}
// Pseudo valuetype mapped to the current pointer size to any address space.
// Should only be used in TableGen.
-def iPTRAny : VTAny<250>;
+def iPTRAny : VTAny<506>;
// Pseudo valuetype to represent "vector of any size"
// Should only be used in TableGen.
-def vAny : VTAny<251>;
+def vAny : VTAny<507>;
// Pseudo valuetype to represent "float of any format"
// Should only be used in TableGen.
-def fAny : VTAny<252>;
+def fAny : VTAny<508>;
// Pseudo valuetype to represent "integer of any bit width"
// Should only be used in TableGen.
-def iAny : VTAny<253>;
+def iAny : VTAny<509>;
// Pseudo valuetype mapped to the current pointer size.
// Should only be used in TableGen.
-def iPTR : ValueType<0, 254>;
+def iPTR : ValueType<0, 510>;
// Pseudo valuetype to represent "any type of any size".
// Should only be used in TableGen.
-def Any : VTAny<255>;
+def Any : VTAny<511>;
} // isNormalValueType = false
diff --git a/llvm/include/llvm/CodeGenTypes/MachineValueType.h b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
index 7e7608c..556531c 100644
--- a/llvm/include/llvm/CodeGenTypes/MachineValueType.h
+++ b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
@@ -33,7 +33,7 @@ namespace llvm {
/// type can be represented by an MVT.
class MVT {
public:
- enum SimpleValueType : uint8_t {
+ enum SimpleValueType : uint16_t {
// Simple value types that aren't explicitly part of this enumeration
// are considered extended value types.
INVALID_SIMPLE_VALUE_TYPE = 0,
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLResource.h b/llvm/include/llvm/Frontend/HLSL/HLSLResource.h
index 4ed742b..989893b 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLResource.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLResource.h
@@ -13,6 +13,7 @@
#ifndef LLVM_FRONTEND_HLSL_HLSLRESOURCE_H
#define LLVM_FRONTEND_HLSL_HLSLRESOURCE_H
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DXILABI.h"
namespace llvm {
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 12571d9..c7913e6 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -67,6 +67,11 @@ public:
bool IsNewDbgInfoFormat;
private:
+ // Allow Function to renumber blocks.
+ friend class Function;
+ /// Per-function unique number.
+ unsigned Number = -1u;
+
friend class BlockAddress;
friend class SymbolTableListTraits<BasicBlock>;
@@ -96,6 +101,11 @@ public:
void setIsNewDbgInfoFormat(bool NewFlag);
void setNewDbgInfoFormatFlag(bool NewFlag);
+ unsigned getNumber() const {
+ assert(getParent() && "only basic blocks in functions have valid numbers");
+ return Number;
+ }
+
/// Record that the collection of DbgRecords in \p M "trails" after the last
/// instruction of this block. These are equivalent to dbg.value intrinsics
/// that exist at the end of a basic block with no terminator (a transient
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index fd7a6aa..4abf9786 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -75,6 +75,13 @@ public:
private:
// Important things that make up a function!
BasicBlockListType BasicBlocks; ///< The basic blocks
+
+ // Basic blocks need to get their number when added to a function.
+ friend void BasicBlock::setParent(Function *);
+ unsigned NextBlockNum = 0;
+ /// Epoch of block numbers. (Could be shrunk to uint8_t if required.)
+ unsigned BlockNumEpoch = 0;
+
mutable Argument *Arguments = nullptr; ///< The formal arguments
size_t NumArgs;
std::unique_ptr<ValueSymbolTable>
@@ -811,6 +818,34 @@ public:
}
//===--------------------------------------------------------------------===//
+ // Block number functions
+
+ /// Return a value larger than the largest block number. Intended to allocate
+ /// a vector that is sufficiently large to hold all blocks indexed by their
+ /// number.
+ unsigned getMaxBlockNumber() const { return NextBlockNum; }
+
+ /// Renumber basic blocks into a dense value range starting from 0. Be aware
+ /// that other data structures and analyses (e.g., DominatorTree) may depend
+ /// on the value numbers and need to be updated or invalidated.
+ void renumberBlocks();
+
+ /// Return the "epoch" of current block numbers. This will return a different
+ /// value after every renumbering. The intention is: if something (e.g., an
+ /// analysis) uses block numbers, it also stores the number epoch and then
+ /// can assert later on that the epoch didn't change (indicating that the
+ /// numbering is still valid). If the epoch changed, blocks might have been
+ /// assigned new numbers and previous uses of the numbers need to be
+ /// invalidated. This is solely intended as a debugging feature.
+ unsigned getBlockNumberEpoch() const { return BlockNumEpoch; }
+
+private:
+ /// Assert that all blocks have unique numbers within 0..NextBlockNum. This
+ /// has O(n) runtime complexity.
+ void validateBlockNumbers() const;
+
+public:
+ //===--------------------------------------------------------------------===//
// BasicBlock iterator forwarding functions
//
iterator begin() { return BasicBlocks.begin(); }
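A sketch of how an analysis might consume the new numbering, using a plain vector indexed by block number plus the epoch as a staleness check; F is an llvm::Function and computeSomething() is a hypothetical helper:

    std::vector<int> PerBlock(F.getMaxBlockNumber(), 0);
    unsigned Epoch = F.getBlockNumberEpoch();
    for (const llvm::BasicBlock &BB : F)
      PerBlock[BB.getNumber()] = computeSomething(BB); // hypothetical helper
    // Later, before reusing PerBlock:
    assert(Epoch == F.getBlockNumberEpoch() && "block numbers have been invalidated");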
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index d9e27e0..9c70a60 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1616,7 +1616,8 @@ m_FCmp(const LHS &L, const RHS &R) {
// Same as CmpClass, but instead of saving Pred as out output variable, match a
// specific input pred for equality.
-template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy>
+template <typename LHS_t, typename RHS_t, typename Class, typename PredicateTy,
+ bool Commutable = false>
struct SpecificCmpClass_match {
const PredicateTy Predicate;
LHS_t L;
@@ -1626,9 +1627,17 @@ struct SpecificCmpClass_match {
: Predicate(Pred), L(LHS), R(RHS) {}
template <typename OpTy> bool match(OpTy *V) {
- if (auto *I = dyn_cast<Class>(V))
- return I->getPredicate() == Predicate && L.match(I->getOperand(0)) &&
- R.match(I->getOperand(1));
+ if (auto *I = dyn_cast<Class>(V)) {
+ if (I->getPredicate() == Predicate && L.match(I->getOperand(0)) &&
+ R.match(I->getOperand(1)))
+ return true;
+ if constexpr (Commutable) {
+ if (I->getPredicate() == Class::getSwappedPredicate(Predicate) &&
+ L.match(I->getOperand(1)) && R.match(I->getOperand(0)))
+ return true;
+ }
+ }
+
return false;
}
};
@@ -1648,6 +1657,13 @@ m_SpecificICmp(ICmpInst::Predicate MatchPred, const LHS &L, const RHS &R) {
}
template <typename LHS, typename RHS>
+inline SpecificCmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>
+m_c_SpecificICmp(ICmpInst::Predicate MatchPred, const LHS &L, const RHS &R) {
+ return SpecificCmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate, true>(
+ MatchPred, L, R);
+}
+
+template <typename LHS, typename RHS>
inline SpecificCmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>
m_SpecificFCmp(FCmpInst::Predicate MatchPred, const LHS &L, const RHS &R) {
return SpecificCmpClass_match<LHS, RHS, FCmpInst, FCmpInst::Predicate>(
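A short illustration (not from the patch) of the commutative matcher added above; V, A and B are assumed to be llvm::Value pointers:

    using namespace llvm::PatternMatch;
    // Matches both (icmp ult A, B) and the swapped form (icmp ugt B, A).
    if (match(V, m_c_SpecificICmp(llvm::ICmpInst::ICMP_ULT, m_Specific(A),
                                  m_Specific(B)))) {
      // ...
    }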
diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index d1d1814..3f88ac0 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -217,9 +217,8 @@ public:
virtual bool writeNopData(raw_ostream &OS, uint64_t Count,
const MCSubtargetInfo *STI) const = 0;
- // Return true if fragment offsets have been adjusted and an extra layout
- // iteration is needed.
- virtual bool finishLayout(const MCAssembler &Asm) const { return false; }
+ /// Give the backend an opportunity to finish layout after relaxation.
+ virtual void finishLayout(MCAssembler const &Asm) const {}
/// Handle any target-specific assembler flags. By default, do nothing.
virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {}
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h
index d9752912..c6fa481 100644
--- a/llvm/include/llvm/MC/MCAssembler.h
+++ b/llvm/include/llvm/MC/MCAssembler.h
@@ -111,7 +111,6 @@ private:
/// Check whether the given fragment needs relaxation.
bool fragmentNeedsRelaxation(const MCRelaxableFragment *IF) const;
- void layoutSection(MCSection &Sec);
/// Perform one layout iteration and return true if any offsets
/// were adjusted.
bool layoutOnce();
@@ -148,9 +147,10 @@ public:
uint64_t computeFragmentSize(const MCFragment &F) const;
void layoutBundle(MCFragment *Prev, MCFragment *F) const;
+ void ensureValid(MCSection &Sec) const;
// Get the offset of the given fragment inside its containing section.
- uint64_t getFragmentOffset(const MCFragment &F) const { return F.Offset; }
+ uint64_t getFragmentOffset(const MCFragment &F) const;
uint64_t getSectionAddressSize(const MCSection &Sec) const;
uint64_t getSectionFileSize(const MCSection &Sec) const;
diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h
index 028bdd1..94d1408 100644
--- a/llvm/include/llvm/MC/MCELFStreamer.h
+++ b/llvm/include/llvm/MC/MCELFStreamer.h
@@ -11,11 +11,11 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCObjectStreamer.h"
namespace llvm {
+class ELFObjectWriter;
class MCContext;
class MCDataFragment;
class MCFragment;
diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
index ee93cb9..bfd349e 100644
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -38,7 +38,6 @@ public:
enum FragmentType : uint8_t {
FT_Align,
FT_Data,
- FT_CompactEncodedInst,
FT_Fill,
FT_Nops,
FT_Relaxable,
@@ -70,8 +69,15 @@ private:
FragmentType Kind;
protected:
+ /// Used by subclasses for better packing.
+ ///
+ /// MCEncodedFragment
bool HasInstructions : 1;
+ bool AlignToBundleEnd : 1;
+ /// MCDataFragment
bool LinkerRelaxable : 1;
+ /// MCRelaxableFragment: x86-specific
+ bool AllowAutoPadding : 1;
MCFragment(FragmentType Kind, bool HasInstructions);
@@ -116,9 +122,6 @@ public:
/// data.
///
class MCEncodedFragment : public MCFragment {
- /// Should this fragment be aligned to the end of a bundle?
- bool AlignToBundleEnd = false;
-
uint8_t BundlePadding = 0;
protected:
@@ -136,7 +139,6 @@ public:
default:
return false;
case MCFragment::FT_Relaxable:
- case MCFragment::FT_CompactEncodedInst:
case MCFragment::FT_Data:
case MCFragment::FT_Dwarf:
case MCFragment::FT_DwarfFrame:
@@ -173,28 +175,11 @@ public:
};
/// Interface implemented by fragments that contain encoded instructions and/or
-/// data.
-///
-template<unsigned ContentsSize>
-class MCEncodedFragmentWithContents : public MCEncodedFragment {
- SmallVector<char, ContentsSize> Contents;
-
-protected:
- MCEncodedFragmentWithContents(MCFragment::FragmentType FType,
- bool HasInstructions)
- : MCEncodedFragment(FType, HasInstructions) {}
-
-public:
- SmallVectorImpl<char> &getContents() { return Contents; }
- const SmallVectorImpl<char> &getContents() const { return Contents; }
-};
-
-/// Interface implemented by fragments that contain encoded instructions and/or
/// data and also have fixups registered.
///
-template<unsigned ContentsSize, unsigned FixupsSize>
-class MCEncodedFragmentWithFixups :
- public MCEncodedFragmentWithContents<ContentsSize> {
+template <unsigned ContentsSize, unsigned FixupsSize>
+class MCEncodedFragmentWithFixups : public MCEncodedFragment {
+ SmallVector<char, ContentsSize> Contents;
/// The list of fixups in this fragment.
SmallVector<MCFixup, FixupsSize> Fixups;
@@ -202,13 +187,16 @@ class MCEncodedFragmentWithFixups :
protected:
MCEncodedFragmentWithFixups(MCFragment::FragmentType FType,
bool HasInstructions)
- : MCEncodedFragmentWithContents<ContentsSize>(FType, HasInstructions) {}
+ : MCEncodedFragment(FType, HasInstructions) {}
public:
using const_fixup_iterator = SmallVectorImpl<MCFixup>::const_iterator;
using fixup_iterator = SmallVectorImpl<MCFixup>::iterator;
+ SmallVectorImpl<char> &getContents() { return Contents; }
+ const SmallVectorImpl<char> &getContents() const { return Contents; }
+
SmallVectorImpl<MCFixup> &getFixups() { return Fixups; }
const SmallVectorImpl<MCFixup> &getFixups() const { return Fixups; }
@@ -240,30 +228,12 @@ public:
void setLinkerRelaxable() { LinkerRelaxable = true; }
};
-/// This is a compact (memory-size-wise) fragment for holding an encoded
-/// instruction (non-relaxable) that has no fixups registered. When applicable,
-/// it can be used instead of MCDataFragment and lead to lower memory
-/// consumption.
-///
-class MCCompactEncodedInstFragment : public MCEncodedFragmentWithContents<4> {
-public:
- MCCompactEncodedInstFragment()
- : MCEncodedFragmentWithContents(FT_CompactEncodedInst, true) {}
-
- static bool classof(const MCFragment *F) {
- return F->getKind() == MCFragment::FT_CompactEncodedInst;
- }
-};
-
/// A relaxable fragment holds on to its MCInst, since it may need to be
/// relaxed during the assembler layout and relaxation stage.
///
class MCRelaxableFragment : public MCEncodedFragmentWithFixups<8, 1> {
-
/// The instruction this is a fragment for.
MCInst Inst;
- /// Can we auto pad the instruction?
- bool AllowAutoPadding = false;
public:
MCRelaxableFragment(const MCInst &Inst, const MCSubtargetInfo &STI)
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index 1289d6f..dcdcd09 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -99,6 +99,8 @@ private:
/// Whether this section has had instructions emitted into it.
bool HasInstructions : 1;
+ bool HasLayout : 1;
+
bool IsRegistered : 1;
bool IsText : 1;
@@ -167,6 +169,9 @@ public:
bool hasInstructions() const { return HasInstructions; }
void setHasInstructions(bool Value) { HasInstructions = Value; }
+ bool hasLayout() const { return HasLayout; }
+ void setHasLayout(bool Value) { HasLayout = Value; }
+
bool isRegistered() const { return IsRegistered; }
void setIsRegistered(bool Value) { IsRegistered = Value; }
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 7f9d90d..5ae0976 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -151,6 +151,18 @@ public:
}
};
+enum class AdjustKind { Set, Add, Subtract };
+
+struct AddressUpdate {
+ uint64_t Value = 0;
+ AdjustKind Kind = AdjustKind::Add;
+};
+
+struct SectionPatternAddressUpdate {
+ NameMatcher SectionPattern;
+ AddressUpdate Update;
+};
+
enum class SymbolFlag {
Global,
Local,
@@ -219,6 +231,7 @@ struct CommonConfig {
SmallVector<NewSectionInfo, 0> AddSection;
SmallVector<StringRef, 0> DumpSection;
SmallVector<NewSectionInfo, 0> UpdateSection;
+ SmallVector<SectionPatternAddressUpdate, 0> ChangeSectionAddress;
// Section matchers
NameMatcher KeepSection;
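A hedged sketch of populating the new ChangeSectionAddress list from a driver; Config is an objcopy::CommonConfig, and the section-pattern handling is elided:

    llvm::objcopy::SectionPatternAddressUpdate PatternUpdate;
    PatternUpdate.Update.Kind = llvm::objcopy::AdjustKind::Add;
    PatternUpdate.Update.Value = 0x1000; // e.g. shift matching sections up by 4 KiB
    // PatternUpdate.SectionPattern would be filled from the user-supplied name/glob.
    Config.ChangeSectionAddress.push_back(std::move(PatternUpdate));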
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 474a195..e1d78a8 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -944,6 +944,10 @@ public:
return Result();
}
};
+
+/// Common option used by multiple tools to print pipeline passes
+extern cl::opt<bool> PrintPipelinePasses;
+
}
#endif
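Exporting the flag lets tools share the option instead of redeclaring their own copy. A sketch of the usual consumer pattern in an opt-style driver, assuming MPM and PIC are in scope:

    if (llvm::PrintPipelinePasses) {
      MPM.printPipeline(llvm::outs(), [&PIC](llvm::StringRef ClassName) {
        auto PassName = PIC.getPassNameForClassName(ClassName);
        return PassName.empty() ? ClassName : PassName;
      });
    }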
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index d449091..824dcf2 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -174,6 +174,10 @@ inline StringRef getInstrProfCounterBiasVarName() {
return INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_COUNTER_BIAS_VAR);
}
+inline StringRef getInstrProfBitmapBiasVarName() {
+ return INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_BITMAP_BIAS_VAR);
+}
+
/// Return the marker used to separate PGO names during serialization.
inline StringRef getInstrProfNameSeparator() { return "\01"; }
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index 847e53c..b9df326 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -738,6 +738,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version
#define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
#define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
+#define INSTR_PROF_PROFILE_BITMAP_BIAS_VAR __llvm_profile_bitmap_bias
#define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
#define INSTR_PROF_PROFILE_SAMPLING_VAR __llvm_profile_sampling
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index 6c04c92..1fd9283 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -18,17 +18,35 @@
//
// namespace sandboxir {
//
-// +- Argument +- BinaryOperator
-// | |
-// Value -+- BasicBlock +- BranchInst
-// | |
-// +- Function +- Constant +- CastInst
-// | | |
-// +- User ------+- Instruction -+- CallInst
+// Value -+- Argument
+// |
+// +- BasicBlock
+// |
+// +- User ------+- Constant ------ Function
+// |
+// +- Instruction -+- BinaryOperator
// |
-// +- CmpInst
+// +- BranchInst
// |
-// +- ExtractElementInst
+// +- CastInst --------+- AddrSpaceCastInst
+// | |
+// | +- BitCastInst
+// | |
+// | +- FPToSIInst
+// | |
+// | +- FPToUIInst
+// | |
+// | +- IntToPtrInst
+// | |
+// | +- PtrToIntInst
+// | |
+// | +- SIToFPInst
+// |
+// +- CallBase -----------+- CallBrInst
+// | |
+// +- CmpInst +- CallInst
+// | |
+// +- ExtractElementInst +- InvokeInst
// |
// +- GetElementPtrInst
// |
@@ -60,6 +78,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/SandboxIR/Tracker.h"
@@ -82,6 +101,14 @@ class ReturnInst;
class StoreInst;
class User;
class Value;
+class CallBase;
+class CallInst;
+class InvokeInst;
+class CallBrInst;
+class GetElementPtrInst;
+class CastInst;
+class PtrToIntInst;
+class BitCastInst;
/// Iterator for the `Use` edges of a User's operands.
/// \Returns the operand `Use` when dereferenced.
@@ -103,12 +130,20 @@ public:
OperandUseIterator() = default;
value_type operator*() const;
OperandUseIterator &operator++();
+ OperandUseIterator operator++(int) {
+ auto Copy = *this;
+ this->operator++();
+ return Copy;
+ }
bool operator==(const OperandUseIterator &Other) const {
return Use == Other.Use;
}
bool operator!=(const OperandUseIterator &Other) const {
return !(*this == Other);
}
+ OperandUseIterator operator+(unsigned Num) const;
+ OperandUseIterator operator-(unsigned Num) const;
+ int operator-(const OperandUseIterator &Other) const;
};
/// Iterator for the `Use` edges of a Value's users.
@@ -135,6 +170,7 @@ public:
bool operator!=(const UserUseIterator &Other) const {
return !(*this == Other);
}
+ const sandboxir::Use &getUse() const { return Use; }
};
/// A SandboxIR Value has users. This is the base class.
@@ -176,14 +212,21 @@ protected:
/// order.
llvm::Value *Val = nullptr;
- friend class Context; // For getting `Val`.
- friend class User; // For getting `Val`.
- friend class Use; // For getting `Val`.
- friend class SelectInst; // For getting `Val`.
- friend class BranchInst; // For getting `Val`.
- friend class LoadInst; // For getting `Val`.
- friend class StoreInst; // For getting `Val`.
- friend class ReturnInst; // For getting `Val`.
+ friend class Context; // For getting `Val`.
+ friend class User; // For getting `Val`.
+ friend class Use; // For getting `Val`.
+ friend class SelectInst; // For getting `Val`.
+ friend class BranchInst; // For getting `Val`.
+ friend class LoadInst; // For getting `Val`.
+ friend class StoreInst; // For getting `Val`.
+ friend class ReturnInst; // For getting `Val`.
+ friend class CallBase; // For getting `Val`.
+ friend class CallInst; // For getting `Val`.
+ friend class InvokeInst; // For getting `Val`.
+ friend class CallBrInst; // For getting `Val`.
+ friend class GetElementPtrInst; // For getting `Val`.
+ friend class CastInst; // For getting `Val`.
+ friend class PHINode; // For getting `Val`.
/// All values point to the context.
Context &Ctx;
@@ -417,7 +460,10 @@ public:
class Constant : public sandboxir::User {
Constant(llvm::Constant *C, sandboxir::Context &SBCtx)
: sandboxir::User(ClassID::Constant, C, SBCtx) {}
- friend class Context; // For constructor.
+ Constant(ClassID ID, llvm::Constant *C, sandboxir::Context &SBCtx)
+ : sandboxir::User(ID, C, SBCtx) {}
+ friend class Function; // For constructor
+ friend class Context; // For constructor.
Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
return getOperandUseDefault(OpIdx, Verify);
}
@@ -435,7 +481,7 @@ public:
return getUseOperandNoDefault(Use);
}
#ifndef NDEBUG
- void verify() const final {
+ void verify() const override {
assert(isa<llvm::Constant>(Val) && "Expected Constant!");
}
friend raw_ostream &operator<<(raw_ostream &OS,
@@ -491,14 +537,64 @@ public:
pointer get() const { return getInstr(It); }
};
+/// Contains a list of sandboxir::Instruction's.
+class BasicBlock : public Value {
+ /// Builds a graph that contains all values in \p BB in their original form
+ /// i.e., no vectorization is taking place here.
+ void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
+ friend class Context; // For `buildBasicBlockFromLLVMIR`
+ friend class Instruction; // For LLVM Val.
+
+ BasicBlock(llvm::BasicBlock *BB, Context &SBCtx)
+ : Value(ClassID::Block, BB, SBCtx) {
+ buildBasicBlockFromLLVMIR(BB);
+ }
+
+public:
+ ~BasicBlock() = default;
+ /// For isa/dyn_cast.
+ static bool classof(const Value *From) {
+ return From->getSubclassID() == Value::ClassID::Block;
+ }
+ Function *getParent() const;
+ using iterator = BBIterator;
+ iterator begin() const;
+ iterator end() const {
+ auto *BB = cast<llvm::BasicBlock>(Val);
+ return iterator(BB, BB->end(), &Ctx);
+ }
+ std::reverse_iterator<iterator> rbegin() const {
+ return std::make_reverse_iterator(end());
+ }
+ std::reverse_iterator<iterator> rend() const {
+ return std::make_reverse_iterator(begin());
+ }
+ Context &getContext() const { return Ctx; }
+ Instruction *getTerminator() const;
+ bool empty() const { return begin() == end(); }
+ Instruction &front() const;
+ Instruction &back() const;
+
+#ifndef NDEBUG
+ void verify() const final {
+ assert(isa<llvm::BasicBlock>(Val) && "Expected BasicBlock!");
+ }
+ friend raw_ostream &operator<<(raw_ostream &OS, const BasicBlock &SBBB) {
+ SBBB.dump(OS);
+ return OS;
+ }
+ void dump(raw_ostream &OS) const final;
+ LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
/// A sandboxir::User with operands, opcode and linked with previous/next
/// instructions in an instruction list.
class Instruction : public sandboxir::User {
public:
enum class Opcode {
-#define DEF_VALUE(ID, CLASS)
-#define DEF_USER(ID, CLASS)
#define OP(OPC) OPC,
+#define OPCODES(...) __VA_ARGS__
#define DEF_INSTR(ID, OPC, CLASS) OPC
#include "llvm/SandboxIR/SandboxIRValues.def"
};
@@ -513,11 +609,17 @@ protected:
/// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This
/// returns its topmost LLVM IR instruction.
llvm::Instruction *getTopmostLLVMInstruction() const;
- friend class SelectInst; // For getTopmostLLVMInstruction().
- friend class BranchInst; // For getTopmostLLVMInstruction().
- friend class LoadInst; // For getTopmostLLVMInstruction().
- friend class StoreInst; // For getTopmostLLVMInstruction().
- friend class ReturnInst; // For getTopmostLLVMInstruction().
+ friend class SelectInst; // For getTopmostLLVMInstruction().
+ friend class BranchInst; // For getTopmostLLVMInstruction().
+ friend class LoadInst; // For getTopmostLLVMInstruction().
+ friend class StoreInst; // For getTopmostLLVMInstruction().
+ friend class ReturnInst; // For getTopmostLLVMInstruction().
+ friend class CallInst; // For getTopmostLLVMInstruction().
+ friend class InvokeInst; // For getTopmostLLVMInstruction().
+ friend class CallBrInst; // For getTopmostLLVMInstruction().
+ friend class GetElementPtrInst; // For getTopmostLLVMInstruction().
+ friend class CastInst; // For getTopmostLLVMInstruction().
+ friend class PHINode; // For getTopmostLLVMInstruction().
/// \Returns the LLVM IR Instructions that this SandboxIR maps to in program
/// order.
@@ -735,6 +837,9 @@ class LoadInst final : public Instruction {
}
public:
+ /// Return true if this is a load from a volatile memory location.
+ bool isVolatile() const { return cast<llvm::LoadInst>(Val)->isVolatile(); }
+
unsigned getUseOperandNo(const Use &Use) const final {
return getUseOperandNoDefault(Use);
}
@@ -744,8 +849,15 @@ public:
Instruction *InsertBefore, Context &Ctx,
const Twine &Name = "");
static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ Instruction *InsertBefore, bool IsVolatile,
+ Context &Ctx, const Twine &Name = "");
+ static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
BasicBlock *InsertAtEnd, Context &Ctx,
const Twine &Name = "");
+ static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ BasicBlock *InsertAtEnd, bool IsVolatile,
+ Context &Ctx, const Twine &Name = "");
+
/// For isa/dyn_cast.
static bool classof(const Value *From);
Value *getPointerOperand() const;
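A minimal sketch (illustrative, not from the patch) of the new volatile-aware creation path; Ty, Ptr, InsertBefore and Ctx are assumed to be in scope:

    auto *Ld = sandboxir::LoadInst::create(Ty, Ptr, llvm::Align(8), InsertBefore,
                                           /*IsVolatile=*/true, Ctx, "vload");
    assert(Ld->isVolatile() && "created as a volatile load");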
@@ -774,6 +886,8 @@ class StoreInst final : public Instruction {
}
public:
+ /// Return true if this is a store to a volatile memory location.
+ bool isVolatile() const { return cast<llvm::StoreInst>(Val)->isVolatile(); }
unsigned getUseOperandNo(const Use &Use) const final {
return getUseOperandNoDefault(Use);
}
@@ -781,7 +895,13 @@ public:
static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
Instruction *InsertBefore, Context &Ctx);
static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+ Instruction *InsertBefore, bool IsVolatile,
+ Context &Ctx);
+ static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
BasicBlock *InsertAtEnd, Context &Ctx);
+ static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+ BasicBlock *InsertAtEnd, bool IsVolatile,
+ Context &Ctx);
/// For isa/dyn_cast.
static bool classof(const Value *From);
Value *getValueOperand() const;
@@ -835,14 +955,146 @@ public:
#endif
};
-/// An LLLVM Instruction that has no SandboxIR equivalent class gets mapped to
-/// an OpaqueInstr.
-class OpaqueInst : public sandboxir::Instruction {
- OpaqueInst(llvm::Instruction *I, sandboxir::Context &Ctx)
- : sandboxir::Instruction(ClassID::Opaque, Opcode::Opaque, I, Ctx) {}
- OpaqueInst(ClassID SubclassID, llvm::Instruction *I, sandboxir::Context &Ctx)
- : sandboxir::Instruction(SubclassID, Opcode::Opaque, I, Ctx) {}
- friend class Context; // For constructor.
+class CallBase : public Instruction {
+ CallBase(ClassID ID, Opcode Opc, llvm::Instruction *I, Context &Ctx)
+ : Instruction(ID, Opc, I, Ctx) {}
+ friend class CallInst; // For constructor.
+ friend class InvokeInst; // For constructor.
+ friend class CallBrInst; // For constructor.
+
+public:
+ static bool classof(const Value *From) {
+ auto Opc = From->getSubclassID();
+ return Opc == Instruction::ClassID::Call ||
+ Opc == Instruction::ClassID::Invoke ||
+ Opc == Instruction::ClassID::CallBr;
+ }
+
+ FunctionType *getFunctionType() const {
+ return cast<llvm::CallBase>(Val)->getFunctionType();
+ }
+
+ op_iterator data_operands_begin() { return op_begin(); }
+ const_op_iterator data_operands_begin() const {
+ return const_cast<CallBase *>(this)->data_operands_begin();
+ }
+ op_iterator data_operands_end() {
+ auto *LLVMCB = cast<llvm::CallBase>(Val);
+ auto Dist = LLVMCB->data_operands_end() - LLVMCB->data_operands_begin();
+ return op_begin() + Dist;
+ }
+ const_op_iterator data_operands_end() const {
+ auto *LLVMCB = cast<llvm::CallBase>(Val);
+ auto Dist = LLVMCB->data_operands_end() - LLVMCB->data_operands_begin();
+ return op_begin() + Dist;
+ }
+ iterator_range<op_iterator> data_ops() {
+ return make_range(data_operands_begin(), data_operands_end());
+ }
+ iterator_range<const_op_iterator> data_ops() const {
+ return make_range(data_operands_begin(), data_operands_end());
+ }
+ bool data_operands_empty() const {
+ return data_operands_end() == data_operands_begin();
+ }
+ unsigned data_operands_size() const {
+ return std::distance(data_operands_begin(), data_operands_end());
+ }
+ bool isDataOperand(Use U) const {
+ assert(this == U.getUser() &&
+ "Only valid to query with a use of this instruction!");
+ return cast<llvm::CallBase>(Val)->isDataOperand(U.LLVMUse);
+ }
+ unsigned getDataOperandNo(Use U) const {
+ assert(isDataOperand(U) && "Data operand # out of range!");
+ return cast<llvm::CallBase>(Val)->getDataOperandNo(U.LLVMUse);
+ }
+
+ /// Return the total number of operands (not operand bundles) used by
+ /// every operand bundle in this OperandBundleUser.
+ unsigned getNumTotalBundleOperands() const {
+ return cast<llvm::CallBase>(Val)->getNumTotalBundleOperands();
+ }
+
+ op_iterator arg_begin() { return op_begin(); }
+ const_op_iterator arg_begin() const { return op_begin(); }
+ op_iterator arg_end() {
+ return data_operands_end() - getNumTotalBundleOperands();
+ }
+ const_op_iterator arg_end() const {
+ return const_cast<CallBase *>(this)->arg_end();
+ }
+ iterator_range<op_iterator> args() {
+ return make_range(arg_begin(), arg_end());
+ }
+ iterator_range<const_op_iterator> args() const {
+ return make_range(arg_begin(), arg_end());
+ }
+ bool arg_empty() const { return arg_end() == arg_begin(); }
+ unsigned arg_size() const { return arg_end() - arg_begin(); }
+
+ Value *getArgOperand(unsigned OpIdx) const {
+ assert(OpIdx < arg_size() && "Out of bounds!");
+ return getOperand(OpIdx);
+ }
+ void setArgOperand(unsigned OpIdx, Value *NewOp) {
+ assert(OpIdx < arg_size() && "Out of bounds!");
+ setOperand(OpIdx, NewOp);
+ }
+
+ Use getArgOperandUse(unsigned Idx) const {
+ assert(Idx < arg_size() && "Out of bounds!");
+ return getOperandUse(Idx);
+ }
+ Use getArgOperandUse(unsigned Idx) {
+ assert(Idx < arg_size() && "Out of bounds!");
+ return getOperandUse(Idx);
+ }
+
+ bool isArgOperand(Use U) const {
+ return cast<llvm::CallBase>(Val)->isArgOperand(U.LLVMUse);
+ }
+ unsigned getArgOperandNo(Use U) const {
+ return cast<llvm::CallBase>(Val)->getArgOperandNo(U.LLVMUse);
+ }
+ bool hasArgument(const Value *V) const { return is_contained(args(), V); }
+
+ Value *getCalledOperand() const;
+ Use getCalledOperandUse() const;
+
+ Function *getCalledFunction() const;
+ bool isIndirectCall() const {
+ return cast<llvm::CallBase>(Val)->isIndirectCall();
+ }
+ bool isCallee(Use U) const {
+ return cast<llvm::CallBase>(Val)->isCallee(U.LLVMUse);
+ }
+ Function *getCaller();
+ const Function *getCaller() const {
+ return const_cast<CallBase *>(this)->getCaller();
+ }
+ bool isMustTailCall() const {
+ return cast<llvm::CallBase>(Val)->isMustTailCall();
+ }
+ bool isTailCall() const { return cast<llvm::CallBase>(Val)->isTailCall(); }
+ Intrinsic::ID getIntrinsicID() const {
+ return cast<llvm::CallBase>(Val)->getIntrinsicID();
+ }
+ void setCalledOperand(Value *V) { getCalledOperandUse().set(V); }
+ void setCalledFunction(Function *F);
+ CallingConv::ID getCallingConv() const {
+ return cast<llvm::CallBase>(Val)->getCallingConv();
+ }
+ bool isInlineAsm() const { return cast<llvm::CallBase>(Val)->isInlineAsm(); }
+};
+
+class CallInst final : public CallBase {
+ /// Use Context::createCallInst(). Don't call the
+ /// constructor directly.
+ CallInst(llvm::Instruction *I, Context &Ctx)
+ : CallBase(ClassID::Call, Opcode::Call, I, Ctx) {}
+ friend class Context; // For accessing the constructor in
+ // create*()
Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
return getOperandUseDefault(OpIdx, Verify);
}
@@ -851,20 +1103,88 @@ class OpaqueInst : public sandboxir::Instruction {
}
public:
- static bool classof(const sandboxir::Value *From) {
- return From->getSubclassID() == ClassID::Opaque;
+ static CallInst *create(FunctionType *FTy, Value *Func,
+ ArrayRef<Value *> Args, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr = "");
+ static CallInst *create(FunctionType *FTy, Value *Func,
+ ArrayRef<Value *> Args, Instruction *InsertBefore,
+ Context &Ctx, const Twine &NameStr = "");
+ static CallInst *create(FunctionType *FTy, Value *Func,
+ ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &NameStr = "");
+
+ static bool classof(const Value *From) {
+ return From->getSubclassID() == ClassID::Call;
}
unsigned getUseOperandNo(const Use &Use) const final {
return getUseOperandNoDefault(Use);
}
unsigned getNumOfIRInstrs() const final { return 1u; }
#ifndef NDEBUG
- void verify() const final {
- // Nothing to do
+ void verify() const final {}
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
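A hedged sketch of creating a call through the new class; FTy, Callee, Arg0, Arg1, InsertBefore and Ctx are assumed to be in scope:

    auto *Call = sandboxir::CallInst::create(FTy, Callee, {Arg0, Arg1},
                                             InsertBefore, Ctx, "call");
    if (Call->isIndirectCall()) {
      // e.g. handle calls whose callee is not a known Function.
    }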
+class InvokeInst final : public CallBase {
+ /// Use Context::createInvokeInst(). Don't call the
+ /// constructor directly.
+ InvokeInst(llvm::Instruction *I, Context &Ctx)
+ : CallBase(ClassID::Invoke, Opcode::Invoke, I, Ctx) {}
+ friend class Context; // For accessing the constructor in
+ // create*()
+ Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+ return getOperandUseDefault(OpIdx, Verify);
}
- friend raw_ostream &operator<<(raw_ostream &OS,
- const sandboxir::OpaqueInst &OI) {
- OI.dump(OS);
+ SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+ return {cast<llvm::Instruction>(Val)};
+ }
+
+public:
+ static InvokeInst *create(FunctionType *FTy, Value *Func,
+ BasicBlock *IfNormal, BasicBlock *IfException,
+ ArrayRef<Value *> Args, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr = "");
+ static InvokeInst *create(FunctionType *FTy, Value *Func,
+ BasicBlock *IfNormal, BasicBlock *IfException,
+ ArrayRef<Value *> Args, Instruction *InsertBefore,
+ Context &Ctx, const Twine &NameStr = "");
+ static InvokeInst *create(FunctionType *FTy, Value *Func,
+ BasicBlock *IfNormal, BasicBlock *IfException,
+ ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &NameStr = "");
+
+ static bool classof(const Value *From) {
+ return From->getSubclassID() == ClassID::Invoke;
+ }
+ unsigned getUseOperandNo(const Use &Use) const final {
+ return getUseOperandNoDefault(Use);
+ }
+ unsigned getNumOfIRInstrs() const final { return 1u; }
+ BasicBlock *getNormalDest() const;
+ BasicBlock *getUnwindDest() const;
+ void setNormalDest(BasicBlock *BB);
+ void setUnwindDest(BasicBlock *BB);
+ // TODO: Return a `LandingPadInst` once implemented.
+ Instruction *getLandingPadInst() const;
+ BasicBlock *getSuccessor(unsigned SuccIdx) const;
+ void setSuccessor(unsigned SuccIdx, BasicBlock *NewSucc) {
+ assert(SuccIdx < 2 && "Successor # out of range for invoke!");
+ if (SuccIdx == 0)
+ setNormalDest(NewSucc);
+ else
+ setUnwindDest(NewSucc);
+ }
+ unsigned getNumSuccessors() const {
+ return cast<llvm::InvokeInst>(Val)->getNumSuccessors();
+ }
+#ifndef NDEBUG
+ void verify() const final {}
+ friend raw_ostream &operator<<(raw_ostream &OS, const InvokeInst &I) {
+ I.dump(OS);
return OS;
}
void dump(raw_ostream &OS) const override;
@@ -872,54 +1192,537 @@ public:
#endif
};
-/// Contains a list of sandboxir::Instruction's.
-class BasicBlock : public Value {
- /// Builds a graph that contains all values in \p BB in their original form
- /// i.e., no vectorization is taking place here.
- void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
- friend class Context; // For `buildBasicBlockFromIR`
- friend class Instruction; // For LLVM Val.
+class CallBrInst final : public CallBase {
+ /// Use Context::createCallBrInst(). Don't call the
+ /// constructor directly.
+ CallBrInst(llvm::Instruction *I, Context &Ctx)
+ : CallBase(ClassID::CallBr, Opcode::CallBr, I, Ctx) {}
+ friend class Context; // For accessing the constructor in
+ // create*()
+ Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+ return getOperandUseDefault(OpIdx, Verify);
+ }
+ SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+ return {cast<llvm::Instruction>(Val)};
+ }
- BasicBlock(llvm::BasicBlock *BB, Context &SBCtx)
- : Value(ClassID::Block, BB, SBCtx) {
- buildBasicBlockFromLLVMIR(BB);
+public:
+ static CallBrInst *create(FunctionType *FTy, Value *Func,
+ BasicBlock *DefaultDest,
+ ArrayRef<BasicBlock *> IndirectDests,
+ ArrayRef<Value *> Args, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr = "");
+ static CallBrInst *create(FunctionType *FTy, Value *Func,
+ BasicBlock *DefaultDest,
+ ArrayRef<BasicBlock *> IndirectDests,
+ ArrayRef<Value *> Args, Instruction *InsertBefore,
+ Context &Ctx, const Twine &NameStr = "");
+ static CallBrInst *create(FunctionType *FTy, Value *Func,
+ BasicBlock *DefaultDest,
+ ArrayRef<BasicBlock *> IndirectDests,
+ ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &NameStr = "");
+ static bool classof(const Value *From) {
+ return From->getSubclassID() == ClassID::CallBr;
+ }
+ unsigned getUseOperandNo(const Use &Use) const final {
+ return getUseOperandNoDefault(Use);
+ }
+ unsigned getNumOfIRInstrs() const final { return 1u; }
+ unsigned getNumIndirectDests() const {
+ return cast<llvm::CallBrInst>(Val)->getNumIndirectDests();
+ }
+ Value *getIndirectDestLabel(unsigned Idx) const;
+ Value *getIndirectDestLabelUse(unsigned Idx) const;
+ BasicBlock *getDefaultDest() const;
+ BasicBlock *getIndirectDest(unsigned Idx) const;
+ SmallVector<BasicBlock *, 16> getIndirectDests() const;
+ void setDefaultDest(BasicBlock *BB);
+ void setIndirectDest(unsigned Idx, BasicBlock *BB);
+ BasicBlock *getSuccessor(unsigned Idx) const;
+ unsigned getNumSuccessors() const {
+ return cast<llvm::CallBrInst>(Val)->getNumSuccessors();
+ }
+#ifndef NDEBUG
+ void verify() const final {}
+ friend raw_ostream &operator<<(raw_ostream &OS, const CallBrInst &I) {
+ I.dump(OS);
+ return OS;
+ }
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class GetElementPtrInst final : public Instruction {
+ /// Use Context::createGetElementPtrInst(). Don't call
+ /// the constructor directly.
+ GetElementPtrInst(llvm::Instruction *I, Context &Ctx)
+ : Instruction(ClassID::GetElementPtr, Opcode::GetElementPtr, I, Ctx) {}
+ GetElementPtrInst(ClassID SubclassID, llvm::Instruction *I, Context &Ctx)
+ : Instruction(SubclassID, Opcode::GetElementPtr, I, Ctx) {}
+ friend class Context; // For accessing the constructor in
+ // create*()
+ Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+ return getOperandUseDefault(OpIdx, Verify);
+ }
+ SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+ return {cast<llvm::Instruction>(Val)};
}
public:
- ~BasicBlock() = default;
+ static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+ BBIterator WhereIt, BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr = "");
+ static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &NameStr = "");
+ static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+ BasicBlock *InsertAtEnd, Context &Ctx,
+ const Twine &NameStr = "");
+
+ static bool classof(const Value *From) {
+ return From->getSubclassID() == ClassID::GetElementPtr;
+ }
+ unsigned getUseOperandNo(const Use &Use) const final {
+ return getUseOperandNoDefault(Use);
+ }
+ unsigned getNumOfIRInstrs() const final { return 1u; }
+
+ Type *getSourceElementType() const {
+ return cast<llvm::GetElementPtrInst>(Val)->getSourceElementType();
+ }
+ Type *getResultElementType() const {
+ return cast<llvm::GetElementPtrInst>(Val)->getResultElementType();
+ }
+ unsigned getAddressSpace() const {
+ return cast<llvm::GetElementPtrInst>(Val)->getAddressSpace();
+ }
+
+ inline op_iterator idx_begin() { return op_begin() + 1; }
+ inline const_op_iterator idx_begin() const {
+ return const_cast<GetElementPtrInst *>(this)->idx_begin();
+ }
+ inline op_iterator idx_end() { return op_end(); }
+ inline const_op_iterator idx_end() const {
+ return const_cast<GetElementPtrInst *>(this)->idx_end();
+ }
+ inline iterator_range<op_iterator> indices() {
+ return make_range(idx_begin(), idx_end());
+ }
+ inline iterator_range<const_op_iterator> indices() const {
+ return const_cast<GetElementPtrInst *>(this)->indices();
+ }
+
+ Value *getPointerOperand() const;
+ static unsigned getPointerOperandIndex() {
+ return llvm::GetElementPtrInst::getPointerOperandIndex();
+ }
+ Type *getPointerOperandType() const {
+ return cast<llvm::GetElementPtrInst>(Val)->getPointerOperandType();
+ }
+ unsigned getPointerAddressSpace() const {
+ return cast<llvm::GetElementPtrInst>(Val)->getPointerAddressSpace();
+ }
+ unsigned getNumIndices() const {
+ return cast<llvm::GetElementPtrInst>(Val)->getNumIndices();
+ }
+ bool hasIndices() const {
+ return cast<llvm::GetElementPtrInst>(Val)->hasIndices();
+ }
+ bool hasAllConstantIndices() const {
+ return cast<llvm::GetElementPtrInst>(Val)->hasAllConstantIndices();
+ }
+ GEPNoWrapFlags getNoWrapFlags() const {
+ return cast<llvm::GetElementPtrInst>(Val)->getNoWrapFlags();
+ }
+ bool isInBounds() const {
+ return cast<llvm::GetElementPtrInst>(Val)->isInBounds();
+ }
+ bool hasNoUnsignedSignedWrap() const {
+ return cast<llvm::GetElementPtrInst>(Val)->hasNoUnsignedSignedWrap();
+ }
+ bool hasNoUnsignedWrap() const {
+ return cast<llvm::GetElementPtrInst>(Val)->hasNoUnsignedWrap();
+ }
+ bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset) const {
+ return cast<llvm::GetElementPtrInst>(Val)->accumulateConstantOffset(DL,
+ Offset);
+ }
+ // TODO: Add missing member functions.
+
+#ifndef NDEBUG
+ void verify() const final {}
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class CastInst : public Instruction {
+ static Opcode getCastOpcode(llvm::Instruction::CastOps CastOp) {
+ switch (CastOp) {
+ case llvm::Instruction::ZExt:
+ return Opcode::ZExt;
+ case llvm::Instruction::SExt:
+ return Opcode::SExt;
+ case llvm::Instruction::FPToUI:
+ return Opcode::FPToUI;
+ case llvm::Instruction::FPToSI:
+ return Opcode::FPToSI;
+ case llvm::Instruction::FPExt:
+ return Opcode::FPExt;
+ case llvm::Instruction::PtrToInt:
+ return Opcode::PtrToInt;
+ case llvm::Instruction::IntToPtr:
+ return Opcode::IntToPtr;
+ case llvm::Instruction::SIToFP:
+ return Opcode::SIToFP;
+ case llvm::Instruction::UIToFP:
+ return Opcode::UIToFP;
+ case llvm::Instruction::Trunc:
+ return Opcode::Trunc;
+ case llvm::Instruction::FPTrunc:
+ return Opcode::FPTrunc;
+ case llvm::Instruction::BitCast:
+ return Opcode::BitCast;
+ case llvm::Instruction::AddrSpaceCast:
+ return Opcode::AddrSpaceCast;
+ case llvm::Instruction::CastOpsEnd:
+ llvm_unreachable("Bad CastOp!");
+ }
+ llvm_unreachable("Unhandled CastOp!");
+ }
+ /// Use Context::createCastInst(). Don't call the
+ /// constructor directly.
+ CastInst(llvm::CastInst *CI, Context &Ctx)
+ : Instruction(ClassID::Cast, getCastOpcode(CI->getOpcode()), CI, Ctx) {}
+ friend Context; // For the constructor, used by Context::createCastInst().
+ friend class PtrToIntInst; // For constructor.
+ Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+ return getOperandUseDefault(OpIdx, Verify);
+ }
+ SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+ return {cast<llvm::Instruction>(Val)};
+ }
+
+public:
+ unsigned getUseOperandNo(const Use &Use) const final {
+ return getUseOperandNoDefault(Use);
+ }
+ unsigned getNumOfIRInstrs() const final { return 1u; }
+ static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+ BBIterator WhereIt, BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+ BasicBlock *InsertAtEnd, Context &Ctx,
+ const Twine &Name = "");
/// For isa/dyn_cast.
+ static bool classof(const Value *From);
+ Type *getSrcTy() const { return cast<llvm::CastInst>(Val)->getSrcTy(); }
+ Type *getDestTy() const { return cast<llvm::CastInst>(Val)->getDestTy(); }
+#ifndef NDEBUG
+ void verify() const final {
+ assert(isa<llvm::CastInst>(Val) && "Expected CastInst!");
+ }
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class SIToFPInst final : public CastInst {
+public:
+ static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name = "");
+
static bool classof(const Value *From) {
- return From->getSubclassID() == Value::ClassID::Block;
+ if (auto *I = dyn_cast<Instruction>(From))
+ return I->getOpcode() == Opcode::SIToFP;
+ return false;
}
- Function *getParent() const;
- using iterator = BBIterator;
- iterator begin() const;
- iterator end() const {
- auto *BB = cast<llvm::BasicBlock>(Val);
- return iterator(BB, BB->end(), &Ctx);
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final;
+ LLVM_DUMP_METHOD void dump() const final;
+#endif // NDEBUG
+};
+
+class FPToUIInst final : public CastInst {
+public:
+ static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name = "");
+
+ static bool classof(const Value *From) {
+ if (auto *I = dyn_cast<Instruction>(From))
+ return I->getOpcode() == Opcode::FPToUI;
+ return false;
}
- std::reverse_iterator<iterator> rbegin() const {
- return std::make_reverse_iterator(end());
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final;
+ LLVM_DUMP_METHOD void dump() const final;
+#endif // NDEBUG
+};
+
+class FPToSIInst final : public CastInst {
+public:
+ static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name = "");
+
+ static bool classof(const Value *From) {
+ if (auto *I = dyn_cast<Instruction>(From))
+ return I->getOpcode() == Opcode::FPToSI;
+ return false;
}
- std::reverse_iterator<iterator> rend() const {
- return std::make_reverse_iterator(begin());
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final;
+ LLVM_DUMP_METHOD void dump() const final;
+#endif // NDEBUG
+};
+
+class IntToPtrInst final : public CastInst {
+public:
+ static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name = "");
+
+ static bool classof(const Value *From) {
+ if (auto *I = dyn_cast<Instruction>(From))
+ return I->getOpcode() == Opcode::IntToPtr;
+ return false;
}
- Context &getContext() const { return Ctx; }
- Instruction *getTerminator() const;
- bool empty() const { return begin() == end(); }
- Instruction &front() const;
- Instruction &back() const;
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final;
+ LLVM_DUMP_METHOD void dump() const final;
+#endif // NDEBUG
+};
+
+class PHINode final : public Instruction {
+ /// Use Context::createPHINode(). Don't call the constructor directly.
+ PHINode(llvm::PHINode *PHI, Context &Ctx)
+ : Instruction(ClassID::PHI, Opcode::PHI, PHI, Ctx) {}
+ friend Context; // for PHINode()
+ Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+ return getOperandUseDefault(OpIdx, Verify);
+ }
+ SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+ return {cast<llvm::Instruction>(Val)};
+ }
+ /// Helper for mapped_iterator.
+ struct LLVMBBToBB {
+ Context &Ctx;
+ LLVMBBToBB(Context &Ctx) : Ctx(Ctx) {}
+ BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const;
+ };
+
+public:
+ unsigned getUseOperandNo(const Use &Use) const final {
+ return getUseOperandNoDefault(Use);
+ }
+ unsigned getNumOfIRInstrs() const final { return 1u; }
+ static PHINode *create(Type *Ty, unsigned NumReservedValues,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &Name = "");
+ /// For isa/dyn_cast.
+ static bool classof(const Value *From);
+
+ using const_block_iterator =
+ mapped_iterator<llvm::PHINode::const_block_iterator, LLVMBBToBB>;
+
+ const_block_iterator block_begin() const {
+ LLVMBBToBB BBGetter(Ctx);
+ return const_block_iterator(cast<llvm::PHINode>(Val)->block_begin(),
+ BBGetter);
+ }
+ const_block_iterator block_end() const {
+ LLVMBBToBB BBGetter(Ctx);
+ return const_block_iterator(cast<llvm::PHINode>(Val)->block_end(),
+ BBGetter);
+ }
+ iterator_range<const_block_iterator> blocks() const {
+ return make_range(block_begin(), block_end());
+ }
+
+ op_range incoming_values() { return operands(); }
+
+ const_op_range incoming_values() const { return operands(); }
+
+ unsigned getNumIncomingValues() const {
+ return cast<llvm::PHINode>(Val)->getNumIncomingValues();
+ }
+ Value *getIncomingValue(unsigned Idx) const;
+ void setIncomingValue(unsigned Idx, Value *V);
+ static unsigned getOperandNumForIncomingValue(unsigned Idx) {
+ return llvm::PHINode::getOperandNumForIncomingValue(Idx);
+ }
+ static unsigned getIncomingValueNumForOperand(unsigned Idx) {
+ return llvm::PHINode::getIncomingValueNumForOperand(Idx);
+ }
+ BasicBlock *getIncomingBlock(unsigned Idx) const;
+ BasicBlock *getIncomingBlock(const Use &U) const;
+
+ void setIncomingBlock(unsigned Idx, BasicBlock *BB);
+
+ void addIncoming(Value *V, BasicBlock *BB);
+
+ Value *removeIncomingValue(unsigned Idx);
+ Value *removeIncomingValue(BasicBlock *BB);
+ int getBasicBlockIndex(const BasicBlock *BB) const;
+ Value *getIncomingValueForBlock(const BasicBlock *BB) const;
+
+ Value *hasConstantValue() const;
+
+ bool hasConstantOrUndefValue() const {
+ return cast<llvm::PHINode>(Val)->hasConstantOrUndefValue();
+ }
+ bool isComplete() const { return cast<llvm::PHINode>(Val)->isComplete(); }
+ // TODO: Implement the below functions:
+ // void replaceIncomingBlockWith (const BasicBlock *Old, BasicBlock *New);
+ // void copyIncomingBlocks(iterator_range<const_block_iterator> BBRange,
+ // uint32_t ToIdx = 0)
+ // void removeIncomingValueIf(function_ref< bool(unsigned)> Predicate,
+ // bool DeletePHIIfEmpty=true)
#ifndef NDEBUG
void verify() const final {
- assert(isa<llvm::BasicBlock>(Val) && "Expected BasicBlock!");
+ assert(isa<llvm::PHINode>(Val) && "Expected PHINode!");
}
- friend raw_ostream &operator<<(raw_ostream &OS, const BasicBlock &SBBB) {
- SBBB.dump(OS);
- return OS;
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+class PtrToIntInst final : public CastInst {
+public:
+ static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name = "");
+
+ static bool classof(const Value *From) {
+ return isa<Instruction>(From) &&
+ cast<Instruction>(From)->getOpcode() == Opcode::PtrToInt;
}
+#ifndef NDEBUG
void dump(raw_ostream &OS) const final;
LLVM_DUMP_METHOD void dump() const final;
+#endif // NDEBUG
+};
+
+class BitCastInst : public CastInst {
+public:
+ static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name = "");
+
+ static bool classof(const Value *From) {
+ if (auto *I = dyn_cast<Instruction>(From))
+ return I->getOpcode() == Instruction::Opcode::BitCast;
+ return false;
+ }
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+class AddrSpaceCastInst : public CastInst {
+public:
+ static Value *create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name = "");
+ static Value *create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name = "");
+
+ static bool classof(const Value *From) {
+ if (auto *I = dyn_cast<Instruction>(From))
+ return I->getOpcode() == Opcode::AddrSpaceCast;
+ return false;
+ }
+ /// \Returns the pointer operand.
+ Value *getPointerOperand() { return getOperand(0); }
+ /// \Returns the pointer operand.
+ const Value *getPointerOperand() const {
+ return const_cast<AddrSpaceCastInst *>(this)->getPointerOperand();
+ }
+ /// \Returns the operand index of the pointer operand.
+ static unsigned getPointerOperandIndex() { return 0u; }
+ /// \Returns the address space of the pointer operand.
+ unsigned getSrcAddressSpace() const {
+ return getPointerOperand()->getType()->getPointerAddressSpace();
+ }
+ /// \Returns the address space of the result.
+ unsigned getDestAddressSpace() const {
+ return getType()->getPointerAddressSpace();
+ }
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
+#endif
+};
+
+/// An LLVM Instruction that has no SandboxIR equivalent class gets mapped to
+/// an OpaqueInst.
+class OpaqueInst : public sandboxir::Instruction {
+ OpaqueInst(llvm::Instruction *I, sandboxir::Context &Ctx)
+ : sandboxir::Instruction(ClassID::Opaque, Opcode::Opaque, I, Ctx) {}
+ OpaqueInst(ClassID SubclassID, llvm::Instruction *I, sandboxir::Context &Ctx)
+ : sandboxir::Instruction(SubclassID, Opcode::Opaque, I, Ctx) {}
+ friend class Context; // For constructor.
+ Use getOperandUseInternal(unsigned OpIdx, bool Verify) const final {
+ return getOperandUseDefault(OpIdx, Verify);
+ }
+ SmallVector<llvm::Instruction *, 1> getLLVMInstrs() const final {
+ return {cast<llvm::Instruction>(Val)};
+ }
+
+public:
+ static bool classof(const sandboxir::Value *From) {
+ return From->getSubclassID() == ClassID::Opaque;
+ }
+ unsigned getUseOperandNo(const Use &Use) const final {
+ return getUseOperandNoDefault(Use);
+ }
+ unsigned getNumOfIRInstrs() const final { return 1u; }
+#ifndef NDEBUG
+ void verify() const final {
+ // Nothing to do
+ }
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const sandboxir::OpaqueInst &OI) {
+ OI.dump(OS);
+ return OS;
+ }
+ void dump(raw_ostream &OS) const override;
+ LLVM_DUMP_METHOD void dump() const override;
#endif
};
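
The wrapper classes above give SandboxIR opcode-specific entry points over a shared llvm::Instruction; for the cast family, each subclass's create() forwards to CastInst::create() with its fixed opcode. A hedged usage sketch follows (the emitConv helper, header path and variable names are illustrative assumptions; only the SIToFPInst/CastInst API comes from this patch):

#include "llvm/SandboxIR/SandboxIR.h"
#include <cassert>

// Illustrative only: build an SIToFP cast through the SandboxIR wrappers and
// query it with the usual isa/dyn_cast machinery.
llvm::sandboxir::Value *emitConv(llvm::sandboxir::Value *IntVal,
                                 llvm::Type *DblTy,
                                 llvm::sandboxir::Instruction *InsertPt,
                                 llvm::sandboxir::Context &Ctx) {
  llvm::sandboxir::Value *NewV = llvm::sandboxir::SIToFPInst::create(
      IntVal, DblTy, /*InsertBefore=*/InsertPt, Ctx, "conv");
  if (auto *Cast = llvm::dyn_cast<llvm::sandboxir::SIToFPInst>(NewV))
    assert(Cast->getDestTy() == DblTy && "unexpected destination type");
  return NewV;
}
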
@@ -983,6 +1786,18 @@ protected:
friend StoreInst; // For createStoreInst()
ReturnInst *createReturnInst(llvm::ReturnInst *I);
friend ReturnInst; // For createReturnInst()
+ CallInst *createCallInst(llvm::CallInst *I);
+ friend CallInst; // For createCallInst()
+ InvokeInst *createInvokeInst(llvm::InvokeInst *I);
+ friend InvokeInst; // For createInvokeInst()
+ CallBrInst *createCallBrInst(llvm::CallBrInst *I);
+ friend CallBrInst; // For createCallBrInst()
+ GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I);
+ friend GetElementPtrInst; // For createGetElementPtrInst()
+ CastInst *createCastInst(llvm::CastInst *I);
+ friend CastInst; // For createCastInst()
+ PHINode *createPHINode(llvm::PHINode *I);
+ friend PHINode; // For createPHINode()
public:
Context(LLVMContext &LLVMCtx)
@@ -1010,7 +1825,7 @@ public:
size_t getNumValues() const { return LLVMValueToValueMap.size(); }
};
-class Function : public sandboxir::Value {
+class Function : public Constant {
/// Helper for mapped_iterator.
struct LLVMBBToBB {
Context &Ctx;
@@ -1021,7 +1836,7 @@ class Function : public sandboxir::Value {
};
/// Use Context::createFunction() instead.
Function(llvm::Function *F, sandboxir::Context &Ctx)
- : sandboxir::Value(ClassID::Function, F, Ctx) {}
+ : Constant(ClassID::Function, F, Ctx) {}
friend class Context; // For constructor.
public:
@@ -1047,6 +1862,9 @@ public:
LLVMBBToBB BBGetter(Ctx);
return iterator(cast<llvm::Function>(Val)->end(), BBGetter);
}
+ FunctionType *getFunctionType() const {
+ return cast<llvm::Function>(Val)->getFunctionType();
+ }
#ifndef NDEBUG
void verify() const final {
diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
index f3d6167..4cb6011 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def
+++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
@@ -23,14 +23,44 @@ DEF_USER(Constant, Constant)
#ifndef DEF_INSTR
#define DEF_INSTR(ID, OPCODE, CLASS)
#endif
-// ClassID, Opcode(s), Class
-DEF_INSTR(Opaque, OP(Opaque), OpaqueInst)
-DEF_INSTR(Select, OP(Select), SelectInst)
-DEF_INSTR(Br, OP(Br), BranchInst)
-DEF_INSTR(Load, OP(Load), LoadInst)
-DEF_INSTR(Store, OP(Store), StoreInst)
-DEF_INSTR(Ret, OP(Ret), ReturnInst)
+#ifndef OP
+#define OP(OPCODE)
+#endif
+
+#ifndef OPCODES
+#define OPCODES(...)
+#endif
+// clang-format off
+// ClassID, Opcode(s), Class
+DEF_INSTR(Opaque, OP(Opaque), OpaqueInst)
+DEF_INSTR(Select, OP(Select), SelectInst)
+DEF_INSTR(Br, OP(Br), BranchInst)
+DEF_INSTR(Load, OP(Load), LoadInst)
+DEF_INSTR(Store, OP(Store), StoreInst)
+DEF_INSTR(Ret, OP(Ret), ReturnInst)
+DEF_INSTR(Call, OP(Call), CallInst)
+DEF_INSTR(Invoke, OP(Invoke), InvokeInst)
+DEF_INSTR(CallBr, OP(CallBr), CallBrInst)
+DEF_INSTR(GetElementPtr, OP(GetElementPtr), GetElementPtrInst)
+DEF_INSTR(Cast, OPCODES(\
+ OP(ZExt) \
+ OP(SExt) \
+ OP(FPToUI) \
+ OP(FPToSI) \
+ OP(FPExt) \
+ OP(PtrToInt) \
+ OP(IntToPtr) \
+ OP(SIToFP) \
+ OP(UIToFP) \
+ OP(Trunc) \
+ OP(FPTrunc) \
+ OP(BitCast) \
+ OP(AddrSpaceCast) \
+ ), CastInst)
+DEF_INSTR(PHI, OP(PHI), PHINode)
+
+// clang-format on
#ifdef DEF_VALUE
#undef DEF_VALUE
#endif
@@ -43,3 +73,6 @@ DEF_INSTR(Ret, OP(Ret), ReturnInst)
#ifdef OP
#undef OP
#endif
+#ifdef OPCODES
+#undef OPCODES
+#endif
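
For context, SandboxIRValues.def is consumed by defining the macros before including it; the OP()/OPCODES() split lets single-opcode classes and the multi-opcode CastInst share one table. A hedged sketch of an includer generating an opcode enumeration (the enum name and shape are assumptions, not something this file dictates):

// Illustrative only: every OP(...) becomes one enumerator, whether listed
// alone or grouped under OPCODES(...) as CastInst is above.
enum class Opcode : unsigned {
#define OP(OPC) OPC,
#define OPCODES(...) __VA_ARGS__
#define DEF_INSTR(ID, OPC, CLASS) OPC
#include "llvm/SandboxIR/SandboxIRValues.def"
};
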
diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h
index 3daec3f..238e4e9 100644
--- a/llvm/include/llvm/SandboxIR/Tracker.h
+++ b/llvm/include/llvm/SandboxIR/Tracker.h
@@ -53,6 +53,7 @@
namespace llvm::sandboxir {
class BasicBlock;
+class CallBrInst;
class Instruction;
class Tracker;
@@ -101,6 +102,64 @@ public:
#endif
};
+class PHISetIncoming : public IRChangeBase {
+ PHINode &PHI;
+ unsigned Idx;
+ PointerUnion<Value *, BasicBlock *> OrigValueOrBB;
+
+public:
+ enum class What {
+ Value,
+ Block,
+ };
+ PHISetIncoming(PHINode &PHI, unsigned Idx, What What, Tracker &Tracker);
+ void revert() final;
+ void accept() final {}
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final {
+ dumpCommon(OS);
+ OS << "PHISetIncoming";
+ }
+ LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class PHIRemoveIncoming : public IRChangeBase {
+ PHINode &PHI;
+ unsigned RemovedIdx;
+ Value *RemovedV;
+ BasicBlock *RemovedBB;
+
+public:
+ PHIRemoveIncoming(PHINode &PHI, unsigned RemovedIdx, Tracker &Tracker);
+ void revert() final;
+ void accept() final {}
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final {
+ dumpCommon(OS);
+    OS << "PHIRemoveIncoming";
+ }
+ LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class PHIAddIncoming : public IRChangeBase {
+ PHINode &PHI;
+ unsigned Idx;
+
+public:
+ PHIAddIncoming(PHINode &PHI, Tracker &Tracker);
+ void revert() final;
+ void accept() final {}
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final {
+ dumpCommon(OS);
+    OS << "PHIAddIncoming";
+ }
+ LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
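
The three PHI change records above plug into the usual IRChangeBase protocol: the mutating SandboxIR call registers a record before touching the wrapped llvm::PHINode, and revert() restores the previous incoming value or block if the transaction is rolled back. A hedged sketch of the transactional usage this enables (getTracker()/save()/revert()/accept() are assumed from the surrounding Tracker API; profitable() is a hypothetical cost check):

#include "llvm/SandboxIR/SandboxIR.h"
#include "llvm/SandboxIR/Tracker.h"

bool profitable(llvm::sandboxir::PHINode *PHI); // hypothetical predicate

void tryRewritePHI(llvm::sandboxir::PHINode *PHI, llvm::sandboxir::Value *NewV,
                   llvm::sandboxir::BasicBlock *NewBB,
                   llvm::sandboxir::Context &Ctx) {
  auto &Tracker = Ctx.getTracker();
  Tracker.save();                  // start recording IR changes
  PHI->setIncomingValue(0, NewV);  // expected to record a PHISetIncoming
  PHI->setIncomingBlock(0, NewBB); // expected to record another one
  if (!profitable(PHI))
    Tracker.revert();              // undoes the changes, newest first
  else
    Tracker.accept();              // keeps the new IR, drops the records
}
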
/// Tracks swapping a Use with another Use.
class UseSwap : public IRChangeBase {
Use ThisUse;
@@ -177,6 +236,41 @@ public:
#endif // NDEBUG
};
+class CallBrInstSetDefaultDest : public IRChangeBase {
+ CallBrInst *CallBr;
+ BasicBlock *OrigDefaultDest;
+
+public:
+ CallBrInstSetDefaultDest(CallBrInst *CallBr, Tracker &Tracker);
+ void revert() final;
+ void accept() final {}
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final {
+ dumpCommon(OS);
+ OS << "CallBrInstSetDefaultDest";
+ }
+ LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
+class CallBrInstSetIndirectDest : public IRChangeBase {
+ CallBrInst *CallBr;
+ unsigned Idx;
+ BasicBlock *OrigIndirectDest;
+
+public:
+ CallBrInstSetIndirectDest(CallBrInst *CallBr, unsigned Idx, Tracker &Tracker);
+ void revert() final;
+ void accept() final {}
+#ifndef NDEBUG
+ void dump(raw_ostream &OS) const final {
+ dumpCommon(OS);
+ OS << "CallBrInstSetIndirectDest";
+ }
+ LLVM_DUMP_METHOD void dump() const final;
+#endif
+};
+
class MoveInstr : public IRChangeBase {
/// The instruction that moved.
Instruction *MovedI;
diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h
index 03cbfe6..35d01da 100644
--- a/llvm/include/llvm/SandboxIR/Use.h
+++ b/llvm/include/llvm/SandboxIR/Use.h
@@ -21,6 +21,8 @@ namespace llvm::sandboxir {
class Context;
class Value;
class User;
+class CallBase;
+class PHINode;
/// Represents a Def-use/Use-def edge in SandboxIR.
/// NOTE: Unlike llvm::Use, this is not an integral part of the use-def chains.
@@ -40,6 +42,9 @@ class Use {
friend class User; // For constructor
friend class OperandUseIterator; // For constructor
friend class UserUseIterator; // For accessing members
+ friend class CallBase; // For LLVMUse
+ friend class CallBrInst; // For constructor
+ friend class PHINode; // For LLVMUse
public:
operator Value *() const { return get(); }
diff --git a/llvm/include/llvm/Support/DXILABI.h b/llvm/include/llvm/Support/DXILABI.h
index d0bed4d..a2222ee 100644
--- a/llvm/include/llvm/Support/DXILABI.h
+++ b/llvm/include/llvm/Support/DXILABI.h
@@ -17,7 +17,7 @@
#ifndef LLVM_SUPPORT_DXILABI_H
#define LLVM_SUPPORT_DXILABI_H
-#include "llvm/ADT/StringSwitch.h"
+#include <cstdint>
namespace llvm {
namespace dxil {
diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h
index a8e178d..e05e5f0 100644
--- a/llvm/include/llvm/Support/GenericDomTree.h
+++ b/llvm/include/llvm/Support/GenericDomTree.h
@@ -90,11 +90,7 @@ template <class NodeT> class DomTreeNodeBase {
DomTreeNodeBase *getIDom() const { return IDom; }
unsigned getLevel() const { return Level; }
- std::unique_ptr<DomTreeNodeBase> addChild(
- std::unique_ptr<DomTreeNodeBase> C) {
- Children.push_back(C.get());
- return C;
- }
+ void addChild(DomTreeNodeBase *C) { Children.push_back(C); }
bool isLeaf() const { return Children.empty(); }
size_t getNumChildren() const { return Children.size(); }
@@ -636,7 +632,7 @@ protected:
DomTreeNodeBase<NodeT> *IDomNode = getNode(DomBB);
assert(IDomNode && "Not immediate dominator specified for block!");
DFSInfoValid = false;
- return createChild(BB, IDomNode);
+ return createNode(BB, IDomNode);
}
/// Add a new node to the forward dominator tree and make it a new root.
@@ -655,8 +651,8 @@ protected:
} else {
assert(Roots.size() == 1);
NodeT *OldRoot = Roots.front();
- auto &OldNode = DomTreeNodes[OldRoot];
- OldNode = NewNode->addChild(std::move(DomTreeNodes[OldRoot]));
+ DomTreeNodeBase<NodeT> *OldNode = getNode(OldRoot);
+ NewNode->addChild(OldNode);
OldNode->IDom = NewNode;
OldNode->UpdateLevel();
Roots[0] = BB;
@@ -695,7 +691,8 @@ protected:
assert(I != IDom->Children.end() &&
"Not in immediate dominator children set!");
// I am no longer your child...
- IDom->Children.erase(I);
+ std::swap(*I, IDom->Children.back());
+ IDom->Children.pop_back();
}
DomTreeNodes.erase(BB);
@@ -830,16 +827,14 @@ public:
protected:
void addRoot(NodeT *BB) { this->Roots.push_back(BB); }
- DomTreeNodeBase<NodeT> *createChild(NodeT *BB, DomTreeNodeBase<NodeT> *IDom) {
- return (DomTreeNodes[BB] = IDom->addChild(
- std::make_unique<DomTreeNodeBase<NodeT>>(BB, IDom)))
- .get();
- }
-
- DomTreeNodeBase<NodeT> *createNode(NodeT *BB) {
- return (DomTreeNodes[BB] =
- std::make_unique<DomTreeNodeBase<NodeT>>(BB, nullptr))
- .get();
+ DomTreeNodeBase<NodeT> *createNode(NodeT *BB,
+ DomTreeNodeBase<NodeT> *IDom = nullptr) {
+ auto Node = std::make_unique<DomTreeNodeBase<NodeT>>(BB, IDom);
+ auto *NodePtr = Node.get();
+ DomTreeNodes[BB] = std::move(Node);
+ if (IDom)
+ IDom->addChild(NodePtr);
+ return NodePtr;
}
// NewBB is split and now it has one successor. Update dominator tree to
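
The net effect of this GenericDomTree.h change is a single ownership model: the DomTreeNodes map is the sole owner of tree nodes, and addChild() now only records a raw child pointer. A reduced standalone sketch of that shape (plain std types instead of the real DomTreeNodeBase):

#include <map>
#include <memory>
#include <vector>

struct Node {
  int *Block;
  Node *IDom;
  std::vector<Node *> Children; // non-owning links, like the new addChild()
};

// The map owns every node for its whole lifetime.
std::map<int *, std::unique_ptr<Node>> Nodes;

Node *createNode(int *Block, Node *IDom = nullptr) {
  auto N = std::make_unique<Node>(Node{Block, IDom, {}});
  Node *Ptr = N.get();
  Nodes[Block] = std::move(N);
  if (IDom)
    IDom->Children.push_back(Ptr); // link, but do not transfer ownership
  return Ptr;
}
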
diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h
index 57cbe99..af7ac04 100644
--- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h
@@ -137,12 +137,12 @@ struct SemiNCAInfo {
// immediate dominator.
NodePtr IDom = getIDom(BB);
- assert(IDom || DT.DomTreeNodes[nullptr]);
+ assert(IDom || DT.getNode(nullptr));
TreeNodePtr IDomNode = getNodeForBlock(IDom, DT);
// Add a new tree node for this NodeT, and link it as a child of
// IDomNode
- return DT.createChild(BB, IDomNode);
+ return DT.createNode(BB, IDomNode);
}
static bool AlwaysDescend(NodePtr, NodePtr) { return true; }
@@ -585,8 +585,8 @@ struct SemiNCAInfo {
NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock();
// Loop over all of the discovered blocks in the function...
for (NodePtr W : llvm::drop_begin(NumToNode)) {
- // Don't replace this with 'count', the insertion side effect is important
- if (DT.DomTreeNodes[W]) continue; // Haven't calculated this node yet?
+ if (DT.getNode(W))
+      continue; // This node has already been calculated.
NodePtr ImmDom = getIDom(W);
@@ -595,7 +595,7 @@ struct SemiNCAInfo {
// Add a new tree node for this BasicBlock, and link it as a child of
// IDomNode.
- DT.createChild(W, IDomNode);
+ DT.createNode(W, IDomNode);
}
}
@@ -644,7 +644,7 @@ struct SemiNCAInfo {
// The unreachable node becomes a new root -- a tree node for it.
TreeNodePtr VirtualRoot = DT.getNode(nullptr);
- FromTN = DT.createChild(From, VirtualRoot);
+ FromTN = DT.createNode(From, VirtualRoot);
DT.Roots.push_back(From);
}
@@ -1078,10 +1078,9 @@ struct SemiNCAInfo {
// before deleting their parent.
for (unsigned i = LastDFSNum; i > 0; --i) {
const NodePtr N = SNCA.NumToNode[i];
- const TreeNodePtr TN = DT.getNode(N);
- LLVM_DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n");
-
- EraseNode(DT, TN);
+ LLVM_DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(DT.getNode(N))
+ << "\n");
+ DT.eraseNode(N);
}
// The affected subtree start at the To node -- there's no extra work to do.
@@ -1109,22 +1108,6 @@ struct SemiNCAInfo {
SNCA.reattachExistingSubtree(DT, PrevIDom);
}
- // Removes leaf tree nodes from the dominator tree.
- static void EraseNode(DomTreeT &DT, const TreeNodePtr TN) {
- assert(TN);
- assert(TN->getNumChildren() == 0 && "Not a tree leaf");
-
- const TreeNodePtr IDom = TN->getIDom();
- assert(IDom);
-
- auto ChIt = llvm::find(IDom->Children, TN);
- assert(ChIt != IDom->Children.end());
- std::swap(*ChIt, IDom->Children.back());
- IDom->Children.pop_back();
-
- DT.DomTreeNodes.erase(TN->getBlock());
- }
-
//~~
//===--------------------- DomTree Batch Updater --------------------------===
//~~
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 0d0fa82..e568e42 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -770,6 +770,14 @@ std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) {
#endif
}
+/// Type to force floating-point values onto the stack, so that x86 doesn't
+/// add hidden precision, avoiding rounding differences across platforms.
+#if defined(__i386__) || defined(_M_IX86)
+using stack_float_t = volatile float;
+#else
+using stack_float_t = float;
+#endif
+
} // namespace llvm
#endif
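
The new alias matters on 32-bit x86 because the x87 FPU keeps float intermediates at 80-bit precision unless they are spilled to memory; a volatile local forces that spill, so sums and comparisons round the same way on every target. A minimal sketch of the intended use (the function and data are illustrative; see also the CalcSpillWeights.cpp hunks later in this patch):

#include "llvm/Support/MathExtras.h"

static float sumWeights(const float *W, unsigned N) {
  // On i386, stack_float_t is `volatile float`, so each partial sum is
  // written back to memory and rounded to 32 bits; elsewhere it is a plain
  // float and the loop compiles as usual.
  llvm::stack_float_t Sum = 0.0f;
  for (unsigned I = 0; I != N; ++I)
    Sum = Sum + W[I];
  return Sum;
}
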
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
index 6dfdfb7..1b85766 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h
@@ -1,4 +1,4 @@
-//===--------- Definition of the AddressSanitizer class ---------*- C++ -*-===//
+//===- AddressSanitizer.h - AddressSanitizer instrumentation ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h
index 41ba05c..3256ddd 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h
@@ -1,4 +1,4 @@
-//===- DataFlowSanitizer.h - dynamic data flow analysis -------------------===//
+//===- DataFlowSanitizer.h - dynamic data flow analysis ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
index 0984e8ec..f88d832 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h
@@ -1,4 +1,4 @@
-//===- Transforms/Instrumentation/MemorySanitizer.h - MSan Pass -----------===//
+//===- MemorySanitizer.h - MemorySanitizer instrumentation ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
index fd37130..346951fe 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/ThreadSanitizer.h
@@ -1,4 +1,4 @@
-//===- Transforms/Instrumentation/ThreadSanitizer.h - TSan Pass -----------===//
+//===- ThreadSanitizer.h - ThreadSanitizer instrumentation ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index 28ff6c4..010d6b0 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -15,6 +15,7 @@
#define LLVM_TRANSFORMS_UTILS_SSAUPDATERIMPL_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
@@ -413,26 +414,33 @@ public:
/// FindExistingPHI - Look through the PHI nodes in a block to see if any of
/// them match what is needed.
void FindExistingPHI(BlkT *BB, BlockListTy *BlockList) {
+ SmallVector<BBInfo *, 20> TaggedBlocks;
for (auto &SomePHI : BB->phis()) {
- if (CheckIfPHIMatches(&SomePHI)) {
+ if (CheckIfPHIMatches(&SomePHI, TaggedBlocks)) {
RecordMatchingPHIs(BlockList);
break;
}
- // Match failed: clear all the PHITag values.
- for (typename BlockListTy::iterator I = BlockList->begin(),
- E = BlockList->end(); I != E; ++I)
- (*I)->PHITag = nullptr;
}
}
/// CheckIfPHIMatches - Check if a PHI node matches the placement and values
/// in the BBMap.
- bool CheckIfPHIMatches(PhiT *PHI) {
+ bool CheckIfPHIMatches(PhiT *PHI, SmallVectorImpl<BBInfo *> &TaggedBlocks) {
+    // On scope exit (i.e. whenever the match fails), clear the PHITag
+    // values; only the blocks we actually visited need clearing.
+ auto Cleanup = make_scope_exit([&]() {
+ for (BBInfo *TaggedBlock : TaggedBlocks)
+ TaggedBlock->PHITag = nullptr;
+ TaggedBlocks.clear();
+ });
+
SmallVector<PhiT *, 20> WorkList;
WorkList.push_back(PHI);
// Mark that the block containing this PHI has been visited.
- BBMap[PHI->getParent()]->PHITag = PHI;
+ BBInfo *PHIBlock = BBMap[PHI->getParent()];
+ PHIBlock->PHITag = PHI;
+ TaggedBlocks.push_back(PHIBlock);
while (!WorkList.empty()) {
PHI = WorkList.pop_back_val();
@@ -465,10 +473,13 @@ public:
return false;
}
PredInfo->PHITag = IncomingPHIVal;
+ TaggedBlocks.push_back(PredInfo);
WorkList.push_back(IncomingPHIVal);
}
}
+ // Match found, keep PHITags.
+ Cleanup.release();
return true;
}
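
The rewrite above swaps the manual "clear every PHITag after a failed match" loop for an llvm::make_scope_exit guard that is released only on success, so just the blocks actually visited get cleared. The same pattern in isolation (the int* tag slots are a stand-in for the updater's BBInfo records):

#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"

static bool tryMatch(llvm::SmallVectorImpl<int *> &TaggedSlots, int *Slot,
                     bool MatchSucceeds) {
  // Unless released below, this runs on every exit path and undoes the tags.
  auto Cleanup = llvm::make_scope_exit([&] {
    for (int *S : TaggedSlots)
      *S = 0;
    TaggedSlots.clear();
  });

  *Slot = 1; // tag the block we are visiting
  TaggedSlots.push_back(Slot);

  if (!MatchSucceeds)
    return false; // Cleanup fires here and clears only the visited tags

  Cleanup.release(); // match found: keep the tags for the caller
  return true;
}
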
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 770da12..43b5c92 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -245,6 +245,9 @@ private:
Value *optimizeSnPrintFString(CallInst *CI, IRBuilderBase &B);
Value *optimizeFPrintFString(CallInst *CI, IRBuilderBase &B);
+ /// Exit functions
+ Value *optimizeExit(CallInst *CI);
+
/// hasFloatVersion - Checks if there is a float version of the specified
/// function by checking for an existing function with name FuncName + f
bool hasFloatVersion(const Module *M, StringRef FuncName);
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 85ee231..084647b 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1782,8 +1782,8 @@ Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
}
#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
-Constant *ConstantFoldFP128(long double (*NativeFP)(long double),
- const APFloat &V, Type *Ty) {
+Constant *ConstantFoldFP128(float128 (*NativeFP)(float128), const APFloat &V,
+ Type *Ty) {
llvm_fenv_clearexcept();
float128 Result = NativeFP(V.convertToQuad());
if (llvm_fenv_testexcept()) {
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 72cba9d..5e8350f 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -69,8 +69,7 @@ ResourceInfo ResourceInfo::SRV(Value *Symbol, StringRef Name,
ResourceInfo RI(ResourceClass::SRV, Kind, Symbol, Name);
assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) &&
"Invalid ResourceKind for SRV constructor.");
- RI.Typed.ElementTy = ElementTy;
- RI.Typed.ElementCount = ElementCount;
+ RI.setTyped(ElementTy, ElementCount);
return RI;
}
@@ -80,11 +79,11 @@ ResourceInfo ResourceInfo::RawBuffer(Value *Symbol, StringRef Name) {
}
ResourceInfo ResourceInfo::StructuredBuffer(Value *Symbol, StringRef Name,
- uint32_t Stride, Align Alignment) {
+ uint32_t Stride,
+ MaybeAlign Alignment) {
ResourceInfo RI(ResourceClass::SRV, ResourceKind::StructuredBuffer, Symbol,
Name);
- RI.Struct.Stride = Stride;
- RI.Struct.Alignment = Alignment;
+ RI.setStruct(Stride, Alignment);
return RI;
}
@@ -93,9 +92,8 @@ ResourceInfo ResourceInfo::Texture2DMS(Value *Symbol, StringRef Name,
uint32_t ElementCount,
uint32_t SampleCount) {
ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMS, Symbol, Name);
- RI.Typed.ElementTy = ElementTy;
- RI.Typed.ElementCount = ElementCount;
- RI.MultiSample.Count = SampleCount;
+ RI.setTyped(ElementTy, ElementCount);
+ RI.setMultiSample(SampleCount);
return RI;
}
@@ -105,9 +103,8 @@ ResourceInfo ResourceInfo::Texture2DMSArray(Value *Symbol, StringRef Name,
uint32_t SampleCount) {
ResourceInfo RI(ResourceClass::SRV, ResourceKind::Texture2DMSArray, Symbol,
Name);
- RI.Typed.ElementTy = ElementTy;
- RI.Typed.ElementCount = ElementCount;
- RI.MultiSample.Count = SampleCount;
+ RI.setTyped(ElementTy, ElementCount);
+ RI.setMultiSample(SampleCount);
return RI;
}
@@ -118,34 +115,27 @@ ResourceInfo ResourceInfo::UAV(Value *Symbol, StringRef Name,
ResourceInfo RI(ResourceClass::UAV, Kind, Symbol, Name);
assert(RI.isTyped() && !(RI.isStruct() || RI.isMultiSample()) &&
"Invalid ResourceKind for UAV constructor.");
- RI.Typed.ElementTy = ElementTy;
- RI.Typed.ElementCount = ElementCount;
- RI.UAVFlags.GloballyCoherent = GloballyCoherent;
- RI.UAVFlags.IsROV = IsROV;
- RI.UAVFlags.HasCounter = false;
+ RI.setTyped(ElementTy, ElementCount);
+ RI.setUAV(GloballyCoherent, /*HasCounter=*/false, IsROV);
return RI;
}
ResourceInfo ResourceInfo::RWRawBuffer(Value *Symbol, StringRef Name,
bool GloballyCoherent, bool IsROV) {
ResourceInfo RI(ResourceClass::UAV, ResourceKind::RawBuffer, Symbol, Name);
- RI.UAVFlags.GloballyCoherent = GloballyCoherent;
- RI.UAVFlags.IsROV = IsROV;
- RI.UAVFlags.HasCounter = false;
+ RI.setUAV(GloballyCoherent, /*HasCounter=*/false, IsROV);
return RI;
}
ResourceInfo ResourceInfo::RWStructuredBuffer(Value *Symbol, StringRef Name,
- uint32_t Stride, Align Alignment,
+ uint32_t Stride,
+ MaybeAlign Alignment,
bool GloballyCoherent, bool IsROV,
bool HasCounter) {
ResourceInfo RI(ResourceClass::UAV, ResourceKind::StructuredBuffer, Symbol,
Name);
- RI.Struct.Stride = Stride;
- RI.Struct.Alignment = Alignment;
- RI.UAVFlags.GloballyCoherent = GloballyCoherent;
- RI.UAVFlags.IsROV = IsROV;
- RI.UAVFlags.HasCounter = HasCounter;
+ RI.setStruct(Stride, Alignment);
+ RI.setUAV(GloballyCoherent, HasCounter, IsROV);
return RI;
}
@@ -155,12 +145,9 @@ ResourceInfo ResourceInfo::RWTexture2DMS(Value *Symbol, StringRef Name,
uint32_t SampleCount,
bool GloballyCoherent) {
ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMS, Symbol, Name);
- RI.Typed.ElementTy = ElementTy;
- RI.Typed.ElementCount = ElementCount;
- RI.UAVFlags.GloballyCoherent = GloballyCoherent;
- RI.UAVFlags.IsROV = false;
- RI.UAVFlags.HasCounter = false;
- RI.MultiSample.Count = SampleCount;
+ RI.setTyped(ElementTy, ElementCount);
+ RI.setUAV(GloballyCoherent, /*HasCounter=*/false, /*IsROV=*/false);
+ RI.setMultiSample(SampleCount);
return RI;
}
@@ -171,12 +158,9 @@ ResourceInfo ResourceInfo::RWTexture2DMSArray(Value *Symbol, StringRef Name,
bool GloballyCoherent) {
ResourceInfo RI(ResourceClass::UAV, ResourceKind::Texture2DMSArray, Symbol,
Name);
- RI.Typed.ElementTy = ElementTy;
- RI.Typed.ElementCount = ElementCount;
- RI.UAVFlags.GloballyCoherent = GloballyCoherent;
- RI.UAVFlags.IsROV = false;
- RI.UAVFlags.HasCounter = false;
- RI.MultiSample.Count = SampleCount;
+ RI.setTyped(ElementTy, ElementCount);
+ RI.setUAV(GloballyCoherent, /*HasCounter=*/false, /*IsROV=*/false);
+ RI.setMultiSample(SampleCount);
return RI;
}
@@ -184,10 +168,8 @@ ResourceInfo ResourceInfo::FeedbackTexture2D(Value *Symbol, StringRef Name,
SamplerFeedbackType FeedbackTy) {
ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2D, Symbol,
Name);
- RI.UAVFlags.GloballyCoherent = false;
- RI.UAVFlags.IsROV = false;
- RI.UAVFlags.HasCounter = false;
- RI.Feedback.Type = FeedbackTy;
+ RI.setUAV(/*GloballyCoherent=*/false, /*HasCounter=*/false, /*IsROV=*/false);
+ RI.setFeedback(FeedbackTy);
return RI;
}
@@ -196,24 +178,22 @@ ResourceInfo::FeedbackTexture2DArray(Value *Symbol, StringRef Name,
SamplerFeedbackType FeedbackTy) {
ResourceInfo RI(ResourceClass::UAV, ResourceKind::FeedbackTexture2DArray,
Symbol, Name);
- RI.UAVFlags.GloballyCoherent = false;
- RI.UAVFlags.IsROV = false;
- RI.UAVFlags.HasCounter = false;
- RI.Feedback.Type = FeedbackTy;
+ RI.setUAV(/*GloballyCoherent=*/false, /*HasCounter=*/false, /*IsROV=*/false);
+ RI.setFeedback(FeedbackTy);
return RI;
}
ResourceInfo ResourceInfo::CBuffer(Value *Symbol, StringRef Name,
uint32_t Size) {
ResourceInfo RI(ResourceClass::CBuffer, ResourceKind::CBuffer, Symbol, Name);
- RI.CBufferSize = Size;
+ RI.setCBuffer(Size);
return RI;
}
ResourceInfo ResourceInfo::Sampler(Value *Symbol, StringRef Name,
SamplerType SamplerTy) {
ResourceInfo RI(ResourceClass::Sampler, ResourceKind::Sampler, Symbol, Name);
- RI.SamplerTy = SamplerTy;
+ RI.setSampler(SamplerTy);
return RI;
}
@@ -306,7 +286,7 @@ MDTuple *ResourceInfo::getAsMetadata(LLVMContext &Ctx) const {
std::pair<uint32_t, uint32_t> ResourceInfo::getAnnotateProps() const {
uint32_t ResourceKind = llvm::to_underlying(Kind);
- uint32_t AlignLog2 = isStruct() ? Log2(Struct.Alignment) : 0;
+ uint32_t AlignLog2 = isStruct() ? Struct.AlignLog2 : 0;
bool IsUAV = isUAV();
bool IsROV = IsUAV && UAVFlags.IsROV;
bool IsGloballyCoherent = IsUAV && UAVFlags.GloballyCoherent;
diff --git a/llvm/lib/Analysis/DomTreeUpdater.cpp b/llvm/lib/Analysis/DomTreeUpdater.cpp
index 6895317..351bd66 100644
--- a/llvm/lib/Analysis/DomTreeUpdater.cpp
+++ b/llvm/lib/Analysis/DomTreeUpdater.cpp
@@ -42,9 +42,8 @@ bool DomTreeUpdater::forceFlushDeletedBB() {
// delete only has an UnreachableInst inside.
assert(BB->size() == 1 && isa<UnreachableInst>(BB->getTerminator()) &&
"DelBB has been modified while awaiting deletion.");
- BB->removeFromParent();
eraseDelBBNode(BB);
- delete BB;
+ BB->eraseFromParent();
}
DeletedBBs.clear();
Callbacks.clear();
@@ -63,9 +62,8 @@ void DomTreeUpdater::deleteBB(BasicBlock *DelBB) {
return;
}
- DelBB->removeFromParent();
eraseDelBBNode(DelBB);
- delete DelBB;
+ DelBB->eraseFromParent();
}
void DomTreeUpdater::callbackDeleteBB(
@@ -77,8 +75,8 @@ void DomTreeUpdater::callbackDeleteBB(
return;
}
- DelBB->removeFromParent();
eraseDelBBNode(DelBB);
+ DelBB->removeFromParent();
Callback(DelBB);
delete DelBB;
}
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 3a7ae57..12a3193 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -88,7 +88,7 @@ static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal,
else
return nullptr;
- CmpInst::Predicate ExpectedPred, Pred1, Pred2;
+ CmpInst::Predicate ExpectedPred;
if (BinOpCode == BinaryOperator::Or) {
ExpectedPred = ICmpInst::ICMP_NE;
} else if (BinOpCode == BinaryOperator::And) {
@@ -110,10 +110,10 @@ static Value *foldSelectWithBinaryOp(Value *Cond, Value *TrueVal,
// -->
// %TV
Value *X, *Y;
- if (!match(Cond, m_c_BinOp(m_c_ICmp(Pred1, m_Specific(TrueVal),
- m_Specific(FalseVal)),
- m_ICmp(Pred2, m_Value(X), m_Value(Y)))) ||
- Pred1 != Pred2 || Pred1 != ExpectedPred)
+ if (!match(Cond,
+ m_c_BinOp(m_c_SpecificICmp(ExpectedPred, m_Specific(TrueVal),
+ m_Specific(FalseVal)),
+ m_SpecificICmp(ExpectedPred, m_Value(X), m_Value(Y)))))
return nullptr;
if (X == TrueVal || X == FalseVal || Y == TrueVal || Y == FalseVal)
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 61c6aa5..a88469a 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -345,6 +345,19 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
HeaderFirstNonPHI, AC, &DT);
}
+static bool suppressSpeculativeLoadForSanitizers(const Instruction &CtxI) {
+ const Function &F = *CtxI.getFunction();
+ // Speculative load may create a race that did not exist in the source.
+ return F.hasFnAttribute(Attribute::SanitizeThread) ||
+ // Speculative load may load data from dirty regions.
+ F.hasFnAttribute(Attribute::SanitizeAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeHWAddress);
+}
+
+bool llvm::mustSuppressSpeculation(const LoadInst &LI) {
+ return !LI.isUnordered() || suppressSpeculativeLoadForSanitizers(LI);
+}
+
/// Check if executing a load of this pointer value cannot trap.
///
/// If DT and ScanFrom are specified this method performs context-sensitive
@@ -365,8 +378,12 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &S
// If DT is not specified we can't make context-sensitive query
const Instruction* CtxI = DT ? ScanFrom : nullptr;
if (isDereferenceableAndAlignedPointer(V, Alignment, Size, DL, CtxI, AC, DT,
- TLI))
- return true;
+ TLI)) {
+    // With sanitizers, `Dereferenceable` is not always enough to justify an
+    // unconditional load.
+ if (!ScanFrom || !suppressSpeculativeLoadForSanitizers(*ScanFrom))
+ return true;
+ }
if (!ScanFrom)
return false;
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 51cffac..264ac39 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -581,27 +581,10 @@ void SCEVUnknown::allUsesReplacedWith(Value *New) {
/// Compare the two values \p LV and \p RV in terms of their "complexity" where
/// "complexity" is a partial (and somewhat ad-hoc) relation used to order
-/// operands in SCEV expressions. \p EqCache is a set of pairs of values that
-/// have been previously deemed to be "equally complex" by this routine. It is
-/// intended to avoid exponential time complexity in cases like:
-///
-/// %a = f(%x, %y)
-/// %b = f(%a, %a)
-/// %c = f(%b, %b)
-///
-/// %d = f(%x, %y)
-/// %e = f(%d, %d)
-/// %f = f(%e, %e)
-///
-/// CompareValueComplexity(%f, %c)
-///
-/// Since we do not continue running this routine on expression trees once we
-/// have seen unequal values, there is no need to track them in the cache.
-static int
-CompareValueComplexity(EquivalenceClasses<const Value *> &EqCacheValue,
- const LoopInfo *const LI, Value *LV, Value *RV,
- unsigned Depth) {
- if (Depth > MaxValueCompareDepth || EqCacheValue.isEquivalent(LV, RV))
+/// operands in SCEV expressions.
+static int CompareValueComplexity(const LoopInfo *const LI, Value *LV,
+ Value *RV, unsigned Depth) {
+ if (Depth > MaxValueCompareDepth)
return 0;
// Order pointer values after integer values. This helps SCEVExpander form
@@ -660,15 +643,13 @@ CompareValueComplexity(EquivalenceClasses<const Value *> &EqCacheValue,
return (int)LNumOps - (int)RNumOps;
for (unsigned Idx : seq(LNumOps)) {
- int Result =
- CompareValueComplexity(EqCacheValue, LI, LInst->getOperand(Idx),
- RInst->getOperand(Idx), Depth + 1);
+ int Result = CompareValueComplexity(LI, LInst->getOperand(Idx),
+ RInst->getOperand(Idx), Depth + 1);
if (Result != 0)
return Result;
}
}
- EqCacheValue.unionSets(LV, RV);
return 0;
}
@@ -679,7 +660,6 @@ CompareValueComplexity(EquivalenceClasses<const Value *> &EqCacheValue,
// not know if they are equivalent for sure.
static std::optional<int>
CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
- EquivalenceClasses<const Value *> &EqCacheValue,
const LoopInfo *const LI, const SCEV *LHS,
const SCEV *RHS, DominatorTree &DT, unsigned Depth = 0) {
// Fast-path: SCEVs are uniqued so we can do a quick equality check.
@@ -705,8 +685,8 @@ CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
const SCEVUnknown *LU = cast<SCEVUnknown>(LHS);
const SCEVUnknown *RU = cast<SCEVUnknown>(RHS);
- int X = CompareValueComplexity(EqCacheValue, LI, LU->getValue(),
- RU->getValue(), Depth + 1);
+ int X =
+ CompareValueComplexity(LI, LU->getValue(), RU->getValue(), Depth + 1);
if (X == 0)
EqCacheSCEV.unionSets(LHS, RHS);
return X;
@@ -773,8 +753,8 @@ CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
return (int)LNumOps - (int)RNumOps;
for (unsigned i = 0; i != LNumOps; ++i) {
- auto X = CompareSCEVComplexity(EqCacheSCEV, EqCacheValue, LI, LOps[i],
- ROps[i], DT, Depth + 1);
+ auto X = CompareSCEVComplexity(EqCacheSCEV, LI, LOps[i], ROps[i], DT,
+ Depth + 1);
if (X != 0)
return X;
}
@@ -802,12 +782,10 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
if (Ops.size() < 2) return; // Noop
EquivalenceClasses<const SCEV *> EqCacheSCEV;
- EquivalenceClasses<const Value *> EqCacheValue;
// Whether LHS has provably less complexity than RHS.
auto IsLessComplex = [&](const SCEV *LHS, const SCEV *RHS) {
- auto Complexity =
- CompareSCEVComplexity(EqCacheSCEV, EqCacheValue, LI, LHS, RHS, DT);
+ auto Complexity = CompareSCEVComplexity(EqCacheSCEV, LI, LHS, RHS, DT);
return Complexity && *Complexity < 0;
};
if (Ops.size() == 2) {
@@ -9171,23 +9149,21 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp(
// behaviour), and we can prove the test sequence produced must repeat
// the same values on self-wrap of the IV, then we can infer that IV
// doesn't self wrap because if it did, we'd have an infinite (undefined)
- // loop.
+ // loop. Note that a stride of 0 is trivially no-self-wrap by definition.
if (ControllingFiniteLoop && isLoopInvariant(RHS, L)) {
// TODO: We can peel off any functions which are invertible *in L*. Loop
// invariant terms are effectively constants for our purposes here.
auto *InnerLHS = LHS;
if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(LHS))
InnerLHS = ZExt->getOperand();
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(InnerLHS)) {
- auto *StrideC = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this));
- if (!AR->hasNoSelfWrap() && AR->getLoop() == L && AR->isAffine() &&
- StrideC && StrideC->getAPInt().isPowerOf2()) {
- auto Flags = AR->getNoWrapFlags();
- Flags = setFlags(Flags, SCEV::FlagNW);
- SmallVector<const SCEV*> Operands{AR->operands()};
- Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags);
- setNoWrapFlags(const_cast<SCEVAddRecExpr *>(AR), Flags);
- }
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(InnerLHS);
+ AR && !AR->hasNoSelfWrap() && AR->getLoop() == L && AR->isAffine() &&
+ isKnownToBeAPowerOfTwo(AR->getStepRecurrence(*this), /*OrZero=*/true)) {
+ auto Flags = AR->getNoWrapFlags();
+ Flags = setFlags(Flags, SCEV::FlagNW);
+ SmallVector<const SCEV *> Operands{AR->operands()};
+ Flags = StrengthenNoWrapFlags(this, scAddRecExpr, Operands, Flags);
+ setNoWrapFlags(const_cast<SCEVAddRecExpr *>(AR), Flags);
}
}
@@ -10867,6 +10843,23 @@ bool ScalarEvolution::isKnownNonZero(const SCEV *S) {
return getUnsignedRangeMin(S) != 0;
}
+bool ScalarEvolution::isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero) {
+ auto NonRecursive = [this](const SCEV *S) {
+ if (auto *C = dyn_cast<SCEVConstant>(S))
+ return C->getAPInt().isPowerOf2();
+    // The vscale_range attribute indicates that vscale is a power of two.
+ return isa<SCEVVScale>(S) && F.hasFnAttribute(Attribute::VScaleRange);
+ };
+
+ if (NonRecursive(S))
+ return true;
+
+ auto *Mul = dyn_cast<SCEVMulExpr>(S);
+ if (!Mul)
+ return false;
+ return all_of(Mul->operands(), NonRecursive) && (OrZero || isKnownNonZero(S));
+}
+
std::pair<const SCEV *, const SCEV *>
ScalarEvolution::SplitIntoInitAndPostInc(const Loop *L, const SCEV *S) {
// Compute SCEV on entry of loop L.
@@ -12098,8 +12091,10 @@ bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow(
// C)".
std::optional<APInt> LDiff = computeConstantDifference(LHS, FoundLHS);
+ if (!LDiff)
+ return false;
std::optional<APInt> RDiff = computeConstantDifference(RHS, FoundRHS);
- if (!LDiff || !RDiff || *LDiff != *RDiff)
+ if (!RDiff || *LDiff != *RDiff)
return false;
if (LDiff->isMinValue())
@@ -12795,8 +12790,7 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
if (!isLoopInvariant(RHS, L))
return false;
- auto *StrideC = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this));
- if (!StrideC || !StrideC->getAPInt().isPowerOf2())
+ if (!isKnownToBeAPowerOfTwo(AR->getStepRecurrence(*this), /*OrZero=*/true))
return false;
if (!ControlsOnlyExit || !loopHasNoAbnormalExits(L))
@@ -13152,52 +13146,50 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
// "(Start - End) + (Stride - 1)" has unsigned overflow.
const SCEV *One = getOne(Stride->getType());
bool MayAddOverflow = [&] {
- if (auto *StrideC = dyn_cast<SCEVConstant>(Stride)) {
- if (StrideC->getAPInt().isPowerOf2()) {
- // Suppose Stride is a power of two, and Start/End are unsigned
- // integers. Let UMAX be the largest representable unsigned
- // integer.
- //
- // By the preconditions of this function, we know
- // "(Start + Stride * N) >= End", and this doesn't overflow.
- // As a formula:
- //
- // End <= (Start + Stride * N) <= UMAX
- //
- // Subtracting Start from all the terms:
- //
- // End - Start <= Stride * N <= UMAX - Start
- //
- // Since Start is unsigned, UMAX - Start <= UMAX. Therefore:
- //
- // End - Start <= Stride * N <= UMAX
- //
- // Stride * N is a multiple of Stride. Therefore,
- //
- // End - Start <= Stride * N <= UMAX - (UMAX mod Stride)
- //
- // Since Stride is a power of two, UMAX + 1 is divisible by
- // Stride. Therefore, UMAX mod Stride == Stride - 1. So we can
- // write:
- //
- // End - Start <= Stride * N <= UMAX - Stride - 1
- //
- // Dropping the middle term:
- //
- // End - Start <= UMAX - Stride - 1
- //
- // Adding Stride - 1 to both sides:
- //
- // (End - Start) + (Stride - 1) <= UMAX
- //
- // In other words, the addition doesn't have unsigned overflow.
- //
- // A similar proof works if we treat Start/End as signed values.
- // Just rewrite steps before "End - Start <= Stride * N <= UMAX"
- // to use signed max instead of unsigned max. Note that we're
- // trying to prove a lack of unsigned overflow in either case.
- return false;
- }
+ if (isKnownToBeAPowerOfTwo(Stride)) {
+ // Suppose Stride is a power of two, and Start/End are unsigned
+ // integers. Let UMAX be the largest representable unsigned
+ // integer.
+ //
+ // By the preconditions of this function, we know
+ // "(Start + Stride * N) >= End", and this doesn't overflow.
+ // As a formula:
+ //
+ // End <= (Start + Stride * N) <= UMAX
+ //
+ // Subtracting Start from all the terms:
+ //
+ // End - Start <= Stride * N <= UMAX - Start
+ //
+ // Since Start is unsigned, UMAX - Start <= UMAX. Therefore:
+ //
+ // End - Start <= Stride * N <= UMAX
+ //
+ // Stride * N is a multiple of Stride. Therefore,
+ //
+ // End - Start <= Stride * N <= UMAX - (UMAX mod Stride)
+ //
+ // Since Stride is a power of two, UMAX + 1 is divisible by
+ // Stride. Therefore, UMAX mod Stride == Stride - 1. So we can
+ // write:
+ //
+ // End - Start <= Stride * N <= UMAX - Stride - 1
+ //
+ // Dropping the middle term:
+ //
+ // End - Start <= UMAX - Stride - 1
+ //
+ // Adding Stride - 1 to both sides:
+ //
+ // (End - Start) + (Stride - 1) <= UMAX
+ //
+ // In other words, the addition doesn't have unsigned overflow.
+ //
+ // A similar proof works if we treat Start/End as signed values.
+ // Just rewrite steps before "End - Start <= Stride * N <= UMAX"
+ // to use signed max instead of unsigned max. Note that we're
+ // trying to prove a lack of unsigned overflow in either case.
+ return false;
}
if (Start == Stride || Start == getMinusSCEV(Stride, One)) {
// If Start is equal to Stride, (End - Start) + (Stride - 1) == End
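
A concrete instance of the power-of-two argument in the comment above, using 8-bit unsigned values (UMAX = 255) and Stride = 8:

    UMAX mod Stride = 255 mod 8 = 7 = Stride - 1
    Stride * N <= UMAX           =>  Stride * N <= 248
    End - Start <= Stride * N    =>  End - Start <= 248
    (End - Start) + (Stride - 1) <=  248 + 7 = 255 = UMAX

so the addition cannot wrap, which is exactly what isKnownToBeAPowerOfTwo(Stride) now lets the code conclude for non-constant strides as well.
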
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 6a0fa98..dcde789 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1198,7 +1198,7 @@ Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
Type *TargetTransformInfo::getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
- unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+ unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicElementSize) const {
return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
DestAddrSpace, SrcAlign, DestAlign,
@@ -1208,7 +1208,7 @@ Type *TargetTransformInfo::getMemcpyLoopLoweringType(
void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize) const {
TTIImpl->getMemcpyLoopResidualLoweringType(
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index bfd26fa..285284d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -254,8 +254,7 @@ bool llvm::haveNoCommonBitsSet(const WithCache<const Value *> &LHSCache,
bool llvm::isOnlyUsedInZeroComparison(const Instruction *I) {
return !I->user_empty() && all_of(I->users(), [](const User *U) {
- ICmpInst::Predicate P;
- return match(U, m_ICmp(P, m_Value(), m_Zero()));
+ return match(U, m_ICmp(m_Value(), m_Zero()));
});
}
@@ -2594,10 +2593,10 @@ static bool isNonZeroRecurrence(const PHINode *PN) {
}
static bool matchOpWithOpEqZero(Value *Op0, Value *Op1) {
- ICmpInst::Predicate Pred;
- return (match(Op0, m_ZExtOrSExt(m_ICmp(Pred, m_Specific(Op1), m_Zero()))) ||
- match(Op1, m_ZExtOrSExt(m_ICmp(Pred, m_Specific(Op0), m_Zero())))) &&
- Pred == ICmpInst::ICMP_EQ;
+ return match(Op0, m_ZExtOrSExt(m_SpecificICmp(ICmpInst::ICMP_EQ,
+ m_Specific(Op1), m_Zero()))) ||
+ match(Op1, m_ZExtOrSExt(m_SpecificICmp(ICmpInst::ICMP_EQ,
+ m_Specific(Op0), m_Zero())));
}
static bool isNonZeroAdd(const APInt &DemandedElts, unsigned Depth,
@@ -6798,17 +6797,6 @@ bool llvm::onlyUsedByLifetimeMarkersOrDroppableInsts(const Value *V) {
V, /* AllowLifetime */ true, /* AllowDroppable */ true);
}
-bool llvm::mustSuppressSpeculation(const LoadInst &LI) {
- if (!LI.isUnordered())
- return true;
- const Function &F = *LI.getFunction();
- // Speculative load may create a race that did not exist in the source.
- return F.hasFnAttribute(Attribute::SanitizeThread) ||
- // Speculative load may load data from dirty regions.
- F.hasFnAttribute(Attribute::SanitizeAddress) ||
- F.hasFnAttribute(Attribute::SanitizeHWAddress);
-}
-
bool llvm::isSafeToSpeculativelyExecute(const Instruction *Inst,
const Instruction *CtxI,
AssumptionCache *AC,
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index c82e749..7c97f7a 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -838,8 +838,6 @@ lltok::Kind LLLexer::LexIdentifier() {
TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context));
TYPEKEYWORD("label", Type::getLabelTy(Context));
TYPEKEYWORD("metadata", Type::getMetadataTy(Context));
- TYPEKEYWORD("x86_mmx", llvm::FixedVectorType::get(
- llvm::IntegerType::get(Context, 64), 1));
TYPEKEYWORD("x86_amx", Type::getX86_AMXTy(Context));
TYPEKEYWORD("token", Type::getTokenTy(Context));
TYPEKEYWORD("ptr", PointerType::getUnqual(Context));
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index a886f6e..9358f89 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -7260,13 +7260,13 @@ bool LLParser::parseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) {
// If RetType is a non-function pointer type, then this is the short syntax
// for the call, which means that RetType is just the return type. Infer the
// rest of the function argument types from the arguments that are present.
-bool LLParser::resolveFunctionType(Type *RetType,
- const SmallVector<ParamInfo, 16> &ArgList,
+bool LLParser::resolveFunctionType(Type *RetType, ArrayRef<ParamInfo> ArgList,
FunctionType *&FuncTy) {
FuncTy = dyn_cast<FunctionType>(RetType);
if (!FuncTy) {
// Pull out the types of all of the arguments...
- std::vector<Type*> ParamTypes;
+ SmallVector<Type *, 8> ParamTypes;
+ ParamTypes.reserve(ArgList.size());
for (const ParamInfo &Arg : ArgList)
ParamTypes.push_back(Arg.V->getType());
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 4e35664..b64fe83 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3967,6 +3967,9 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
CurrentSectionBeginSym = MBB.getSymbol();
}
+ for (auto &Handler : DebugHandlers)
+ Handler->beginCodeAlignment(MBB);
+
// Emit an alignment directive for this block, if needed.
const Align Alignment = MBB.getAlignment();
if (Alignment != Align(1))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index ac4d0f2..9b1965c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -3674,3 +3674,21 @@ bool DwarfDebug::alwaysUseRanges(const DwarfCompileUnit &CU) const {
return true;
return false;
}
+
+void DwarfDebug::beginCodeAlignment(const MachineBasicBlock &MBB) {
+ if (MBB.getAlignment() == Align(1))
+ return;
+
+ auto *SP = MBB.getParent()->getFunction().getSubprogram();
+ bool NoDebug =
+ !SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug;
+
+ if (NoDebug)
+ return;
+
+ auto PrevLoc = Asm->OutStreamer->getContext().getCurrentDwarfLoc();
+ Asm->OutStreamer->emitDwarfLocDirective(
+ PrevLoc.getFileNum(), 0, PrevLoc.getColumn(), 0, 0, 0, StringRef());
+ MCDwarfLineEntry::make(Asm->OutStreamer.get(),
+ Asm->OutStreamer->getCurrentSectionOnly());
+}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 13f4c37..6e37939 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -730,6 +730,9 @@ public:
/// Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
+ /// Process beginning of code alignment.
+ void beginCodeAlignment(const MachineBasicBlock &MBB) override;
+
/// Perform an MD5 checksum of \p Identifier and return the lower 64 bits.
static uint64_t makeTypeSignature(StringRef Identifier);
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 1d767a3..9d8c911 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <tuple>
@@ -257,7 +258,9 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
return -1.0f;
}
- float Weight = 1.0f;
+ // Force Weight onto the stack so that x86 doesn't add hidden precision,
+ // similar to HWeight below.
+ stack_float_t Weight = 1.0f;
if (IsSpillable) {
// Get loop info for mi.
if (MI->getParent() != MBB) {
@@ -284,11 +287,9 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
Register HintReg = copyHint(MI, LI.reg(), TRI, MRI);
if (!HintReg)
continue;
- // Force hweight onto the stack so that x86 doesn't add hidden precision,
+ // Force HWeight onto the stack so that x86 doesn't add hidden precision,
// making the comparison incorrectly pass (i.e., 1 > 1 == true??).
- //
- // FIXME: we probably shouldn't use floats at all.
- volatile float HWeight = Hint[HintReg] += Weight;
+ stack_float_t HWeight = Hint[HintReg] += Weight;
if (HintReg.isVirtual() || MRI.isAllocatable(HintReg))
CopyHints.insert(CopyHint(HintReg, HWeight));
}
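Side note on the stack_float_t change above: it replaces the old volatile-float trick for keeping x87 excess precision out of the weight comparisons. A minimal standalone illustration of the underlying problem (not part of the patch; the effect is only observable on 32-bit x86 builds that use the x87 FPU):

    #include <cstdio>

    int main() {
      float a = 16777216.0f;            // 2^24; 2^24 + 1 is not representable as float
      float b = 1.0f;
      volatile float rounded = a + b;   // forced through memory: rounds back to 2^24
      bool excess = (a + b) > rounded;  // can be true if a + b stays in an x87 register
      std::printf("excess precision observed: %d\n", excess);
      return 0;
    }

With SSE (the default on x86-64) the sum is rounded to float either way, so `excess` stays false; stack_float_t provides that rounding guarantee portably without relying on a volatile store.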
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index d506c62..0de8112 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -181,8 +181,8 @@ public:
bool canConvertIf(MachineBasicBlock *MBB, bool Predicate = false);
/// convertIf - If-convert the last block passed to canConvertIf(), assuming
- /// it is possible. Add any erased blocks to RemovedBlocks.
- void convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+ /// it is possible. Add any blocks that are to be erased to RemoveBlocks.
+ void convertIf(SmallVectorImpl<MachineBasicBlock *> &RemoveBlocks,
bool Predicate = false);
};
} // end anonymous namespace
@@ -678,9 +678,9 @@ void SSAIfConv::rewritePHIOperands() {
/// convertIf - Execute the if conversion after canConvertIf has determined the
/// feasibility.
///
-/// Any basic blocks erased will be added to RemovedBlocks.
+/// Any basic blocks that need to be erased will be added to RemoveBlocks.
///
-void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemoveBlocks,
bool Predicate) {
assert(Head && Tail && TBB && FBB && "Call canConvertIf first.");
@@ -721,15 +721,18 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
DebugLoc HeadDL = Head->getFirstTerminator()->getDebugLoc();
TII->removeBranch(*Head);
- // Erase the now empty conditional blocks. It is likely that Head can fall
+  // Mark the now empty conditional blocks for removal and move them to the
+  // end. It is likely that Head can fall
// through to Tail, and we can join the two blocks.
if (TBB != Tail) {
- RemovedBlocks.push_back(TBB);
- TBB->eraseFromParent();
+ RemoveBlocks.push_back(TBB);
+ if (TBB != &TBB->getParent()->back())
+ TBB->moveAfter(&TBB->getParent()->back());
}
if (FBB != Tail) {
- RemovedBlocks.push_back(FBB);
- FBB->eraseFromParent();
+ RemoveBlocks.push_back(FBB);
+ if (FBB != &FBB->getParent()->back())
+ FBB->moveAfter(&FBB->getParent()->back());
}
assert(Head->succ_empty() && "Additional head successors?");
@@ -740,8 +743,9 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
Head->splice(Head->end(), Tail,
Tail->begin(), Tail->end());
Head->transferSuccessorsAndUpdatePHIs(Tail);
- RemovedBlocks.push_back(Tail);
- Tail->eraseFromParent();
+ RemoveBlocks.push_back(Tail);
+ if (Tail != &Tail->getParent()->back())
+ Tail->moveAfter(&Tail->getParent()->back());
} else {
// We need a branch to Tail, let code placement work it out later.
LLVM_DEBUG(dbgs() << "Converting to unconditional branch.\n");
@@ -1062,11 +1066,13 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
while (IfConv.canConvertIf(MBB) && shouldConvertIf()) {
// If-convert MBB and update analyses.
invalidateTraces();
- SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
- IfConv.convertIf(RemovedBlocks);
+ SmallVector<MachineBasicBlock *, 4> RemoveBlocks;
+ IfConv.convertIf(RemoveBlocks);
Changed = true;
- updateDomTree(DomTree, IfConv, RemovedBlocks);
- updateLoops(Loops, RemovedBlocks);
+ updateDomTree(DomTree, IfConv, RemoveBlocks);
+ for (MachineBasicBlock *MBB : RemoveBlocks)
+ MBB->eraseFromParent();
+ updateLoops(Loops, RemoveBlocks);
}
return Changed;
}
@@ -1200,11 +1206,13 @@ bool EarlyIfPredicator::tryConvertIf(MachineBasicBlock *MBB) {
bool Changed = false;
while (IfConv.canConvertIf(MBB, /*Predicate*/ true) && shouldConvertIf()) {
// If-convert MBB and update analyses.
- SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
- IfConv.convertIf(RemovedBlocks, /*Predicate*/ true);
+ SmallVector<MachineBasicBlock *, 4> RemoveBlocks;
+ IfConv.convertIf(RemoveBlocks, /*Predicate*/ true);
Changed = true;
- updateDomTree(DomTree, IfConv, RemovedBlocks);
- updateLoops(Loops, RemovedBlocks);
+ updateDomTree(DomTree, IfConv, RemoveBlocks);
+ for (MachineBasicBlock *MBB : RemoveBlocks)
+ MBB->eraseFromParent();
+ updateLoops(Loops, RemoveBlocks);
}
return Changed;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 2c77ed8..8fe4819 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -194,7 +194,8 @@ LegalityPredicate LegalityPredicates::memSizeNotByteSizePow2(unsigned MMOIdx) {
return [=](const LegalityQuery &Query) {
const LLT MemTy = Query.MMODescrs[MMOIdx].MemoryTy;
return !MemTy.isByteSized() ||
- !llvm::has_single_bit<uint32_t>(MemTy.getSizeInBytes());
+ !llvm::has_single_bit<uint32_t>(
+ MemTy.getSizeInBytes().getKnownMinValue());
};
}
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index 0a6ce6a..20d5b26 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -2426,9 +2426,7 @@ bool InstrRefBasedLDV::mlocJoin(
// as its predecessors. If a PHI is placed, test to see whether it's now a
// redundant PHI that we can eliminate.
- SmallVector<const MachineBasicBlock *, 8> BlockOrders;
- for (auto *Pred : MBB.predecessors())
- BlockOrders.push_back(Pred);
+ SmallVector<const MachineBasicBlock *, 8> BlockOrders(MBB.predecessors());
// Visit predecessors in RPOT order.
auto Cmp = [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
@@ -3268,9 +3266,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
bool InLocsChanged =
vlocJoin(*MBB, LiveOutIdx, BlocksToExplore, *LiveIn);
- SmallVector<const MachineBasicBlock *, 8> Preds;
- for (const auto *Pred : MBB->predecessors())
- Preds.push_back(Pred);
+ SmallVector<const MachineBasicBlock *, 8> Preds(MBB->predecessors());
// If this block's live-in value is a VPHI, try to pick a machine-value
// for it. This makes the machine-value available and propagated
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 1d16729..bf10794 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -3388,7 +3388,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (expectAndConsume(MIToken::rparen))
return true;
- Size = MemoryType.getSizeInBytes();
+ Size = MemoryType.getSizeInBytes().getKnownMinValue();
}
MachinePointerInfo Ptr = MachinePointerInfo();
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 5abfbd5..d2e02a2 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -33,6 +33,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/xxhash.h"
#define DEBUG_TYPE "machine-stable-hash"
@@ -100,8 +101,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
case MachineOperand::MO_TargetIndex: {
if (const char *Name = MO.getTargetIndexName())
return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
- stable_hash_combine_string(Name),
- MO.getOffset());
+ xxh3_64bits(Name), MO.getOffset());
StableHashBailingTargetIndexNoName++;
return 0;
}
@@ -113,7 +113,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
case MachineOperand::MO_ExternalSymbol:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
- stable_hash_combine_string(MO.getSymbolName()));
+ xxh3_64bits(MO.getSymbolName()));
case MachineOperand::MO_RegisterMask:
case MachineOperand::MO_RegisterLiveOut: {
@@ -151,7 +151,7 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
case MachineOperand::MO_MCSymbol: {
auto SymbolName = MO.getMCSymbol()->getName();
return hash_combine(MO.getType(), MO.getTargetFlags(),
- stable_hash_combine_string(SymbolName));
+ xxh3_64bits(SymbolName));
}
case MachineOperand::MO_CFIIndex:
return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp
index 72b07eb..00dcc1f 100644
--- a/llvm/lib/CodeGen/RegisterBankInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp
@@ -215,8 +215,9 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
}
}
- unsigned Size = getSizeInBits(Reg, MRI, TRI);
- const ValueMapping *ValMapping = &getValueMapping(0, Size, *CurRegBank);
+ TypeSize Size = getSizeInBits(Reg, MRI, TRI);
+ const ValueMapping *ValMapping =
+ &getValueMapping(0, Size.getKnownMinValue(), *CurRegBank);
if (IsCopyLike) {
if (!OperandsMapping[0]) {
if (MI.isRegSequence()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index af77b00..b1ada66 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2294,12 +2294,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_CMP(SDNode *N) {
- SDValue LHS = N->getOpcode() == ISD::UCMP
- ? ZExtPromotedInteger(N->getOperand(0))
- : SExtPromotedInteger(N->getOperand(0));
- SDValue RHS = N->getOpcode() == ISD::UCMP
- ? ZExtPromotedInteger(N->getOperand(1))
- : SExtPromotedInteger(N->getOperand(1));
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (N->getOpcode() == ISD::SCMP) {
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ } else {
+ SExtOrZExtPromotedOperands(LHS, RHS);
+ }
return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS), 0);
}
@@ -3395,13 +3398,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
Hi = DAG.computeKnownBits(HiOps[2]).isZero()
- ? DAG.getNode(ISD::UADDO, dl, VTList, ArrayRef(HiOps, 2))
+ ? DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2))
: DAG.getNode(ISD::UADDO_CARRY, dl, VTList, HiOps);
} else {
Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
Hi = DAG.computeKnownBits(HiOps[2]).isZero()
- ? DAG.getNode(ISD::USUBO, dl, VTList, ArrayRef(HiOps, 2))
+ ? DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2))
: DAG.getNode(ISD::USUBO_CARRY, dl, VTList, HiOps);
}
return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bbc44a4..b3ed7f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2483,6 +2483,11 @@ Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
if (RedAlign2 < RedAlign)
RedAlign = RedAlign2;
+
+ if (!getMachineFunction().getFrameInfo().isStackRealignable())
+ // If the stack is not realignable, the alignment should be limited to the
+    // StackAlignment.
+ RedAlign = std::min(RedAlign, StackAlign);
}
return RedAlign;
@@ -11582,6 +11587,19 @@ public:
} // end anonymous namespace
+/// Return true if a glue output should propagate divergence information.
+static bool gluePropagatesDivergence(const SDNode *Node) {
+ switch (Node->getOpcode()) {
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ return false;
+ default:
+ return true;
+ }
+
+ llvm_unreachable("covered opcode switch");
+}
+
bool SelectionDAG::calculateDivergence(SDNode *N) {
if (TLI->isSDNodeAlwaysUniform(N)) {
assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
@@ -11591,7 +11609,11 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
if (TLI->isSDNodeSourceOfDivergence(N, FLI, UA))
return true;
for (const auto &Op : N->ops()) {
- if (Op.Val.getValueType() != MVT::Other && Op.getNode()->isDivergent())
+ EVT VT = Op.getValueType();
+
+ // Skip Chain. It does not carry divergence.
+ if (VT != MVT::Other && Op.getNode()->isDivergent() &&
+ (VT != MVT::Glue || gluePropagatesDivergence(Op.getNode())))
return true;
}
return false;
@@ -13130,8 +13152,14 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
for (unsigned I = 0; I != Vals.size(); ++I) {
Ops[I].setUser(Node);
Ops[I].setInitial(Vals[I]);
- if (Ops[I].Val.getValueType() != MVT::Other) // Skip Chain. It does not carry divergence.
- IsDivergent |= Ops[I].getNode()->isDivergent();
+ EVT VT = Ops[I].getValueType();
+
+ // Skip Chain. It does not carry divergence.
+ if (VT != MVT::Other &&
+ (VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) &&
+ Ops[I].getNode()->isDivergent()) {
+ IsDivergent = true;
+ }
}
Node->NumOperands = Vals.size();
Node->OperandList = Ops;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c554c0f..9f5e646 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1246,9 +1246,7 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
SmallVector<Value *> Values(It->Values.location_ops());
if (!handleDebugValue(Values, Var, It->Expr, It->DL, SDNodeOrder,
It->Values.hasArgList())) {
- SmallVector<Value *, 4> Vals;
- for (Value *V : It->Values.location_ops())
- Vals.push_back(V);
+ SmallVector<Value *, 4> Vals(It->Values.location_ops());
addDanglingDebugInfo(Vals,
FnVarLocs->getDILocalVariable(It->VariableID),
It->Expr, Vals.size() > 1, It->DL, SDNodeOrder);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 84331d2..607c803 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2596,6 +2596,17 @@ GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
return Val;
}
+/// getSimpleVT - Decode a value from MatcherTable; if it is a VBR-encoded
+/// value, use GetVBR to decode it.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static MVT::SimpleValueType
+getSimpleVT(const unsigned char *MatcherTable, unsigned &MatcherIndex) {
+ unsigned SimpleVT = MatcherTable[MatcherIndex++];
+ if (SimpleVT & 128)
+ SimpleVT = GetVBR(SimpleVT, MatcherTable, MatcherIndex);
+
+ return static_cast<MVT::SimpleValueType>(SimpleVT);
+}
+
void SelectionDAGISel::Select_JUMP_TABLE_DEBUG_INFO(SDNode *N) {
SDLoc dl(N);
CurDAG->SelectNodeTo(N, TargetOpcode::JUMP_TABLE_DEBUG_INFO, MVT::Glue,
@@ -2875,8 +2886,7 @@ CheckChild2CondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
SDValue N, const TargetLowering *TLI, const DataLayout &DL) {
- MVT::SimpleValueType VT =
- static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ MVT::SimpleValueType VT = getSimpleVT(MatcherTable, MatcherIndex);
if (cast<VTSDNode>(N)->getVT() == VT)
return true;
@@ -3006,7 +3016,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
VT = MVT::i64;
break;
default:
- VT = static_cast<MVT::SimpleValueType>(Table[Index++]);
+ VT = getSimpleVT(Table, Index);
break;
}
Result = !::CheckType(VT, N, SDISel.TLI, SDISel.CurDAG->getDataLayout());
@@ -3014,9 +3024,8 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
}
case SelectionDAGISel::OPC_CheckTypeRes: {
unsigned Res = Table[Index++];
- Result = !::CheckType(static_cast<MVT::SimpleValueType>(Table[Index++]),
- N.getValue(Res), SDISel.TLI,
- SDISel.CurDAG->getDataLayout());
+ Result = !::CheckType(getSimpleVT(Table, Index), N.getValue(Res),
+ SDISel.TLI, SDISel.CurDAG->getDataLayout());
return Index;
}
case SelectionDAGISel::OPC_CheckChild0Type:
@@ -3054,7 +3063,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
VT = MVT::i64;
ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0TypeI64;
} else {
- VT = static_cast<MVT::SimpleValueType>(Table[Index++]);
+ VT = getSimpleVT(Table, Index);
ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0Type;
}
Result = !::CheckChildType(VT, N, SDISel.TLI,
@@ -3558,7 +3567,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
VT = MVT::i64;
break;
default:
- VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ VT = getSimpleVT(MatcherTable, MatcherIndex);
break;
}
if (!::CheckType(VT, N, TLI, CurDAG->getDataLayout()))
@@ -3567,9 +3576,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
case OPC_CheckTypeRes: {
unsigned Res = MatcherTable[MatcherIndex++];
- if (!::CheckType(
- static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]),
- N.getValue(Res), TLI, CurDAG->getDataLayout()))
+ if (!::CheckType(getSimpleVT(MatcherTable, MatcherIndex), N.getValue(Res),
+ TLI, CurDAG->getDataLayout()))
break;
continue;
}
@@ -3616,8 +3624,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
if (CaseSize == 0) break;
- MVT CaseVT =
- static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ MVT CaseVT = getSimpleVT(MatcherTable, MatcherIndex);
if (CaseVT == MVT::iPTR)
CaseVT = TLI->getPointerTy(CurDAG->getDataLayout());
@@ -3673,7 +3680,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
VT = MVT::i64;
ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0TypeI64;
} else {
- VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ VT = getSimpleVT(MatcherTable, MatcherIndex);
ChildNo = Opcode - SelectionDAGISel::OPC_CheckChild0Type;
}
if (!::CheckChildType(VT, N, TLI, CurDAG->getDataLayout(), ChildNo))
@@ -3767,7 +3774,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
VT = MVT::i64;
break;
default:
- VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ VT = getSimpleVT(MatcherTable, MatcherIndex);
break;
}
int64_t Val = MatcherTable[MatcherIndex++];
@@ -3791,7 +3798,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
VT = MVT::i64;
break;
default:
- VT = static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ VT = getSimpleVT(MatcherTable, MatcherIndex);
break;
}
unsigned RegNo = MatcherTable[MatcherIndex++];
@@ -3803,8 +3810,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// For targets w/ more than 256 register names, the register enum
// values are stored in two bytes in the matcher table (just like
// opcodes).
- MVT::SimpleValueType VT =
- static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ MVT::SimpleValueType VT = getSimpleVT(MatcherTable, MatcherIndex);
unsigned RegNo = MatcherTable[MatcherIndex++];
RegNo |= MatcherTable[MatcherIndex++] << 8;
RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
@@ -4042,8 +4048,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
NumVTs = MatcherTable[MatcherIndex++];
SmallVector<EVT, 4> VTs;
for (unsigned i = 0; i != NumVTs; ++i) {
- MVT::SimpleValueType VT =
- static_cast<MVT::SimpleValueType>(MatcherTable[MatcherIndex++]);
+ MVT::SimpleValueType VT = getSimpleVT(MatcherTable, MatcherIndex);
if (VT == MVT::iPTR)
VT = TLI->getPointerTy(CurDAG->getDataLayout()).SimpleTy;
VTs.push_back(VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 140c97c..7fa83a5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8556,11 +8556,12 @@ static std::optional<bool> isFCmpEqualZero(FPClassTest Test,
}
SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
- FPClassTest Test, SDNodeFlags Flags,
- const SDLoc &DL,
+ const FPClassTest OrigTestMask,
+ SDNodeFlags Flags, const SDLoc &DL,
SelectionDAG &DAG) const {
EVT OperandVT = Op.getValueType();
assert(OperandVT.isFloatingPoint());
+ FPClassTest Test = OrigTestMask;
// Degenerated cases.
if (Test == fcNone)
@@ -8594,9 +8595,21 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
// exceptions are ignored.
if (Flags.hasNoFPExcept() &&
isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) {
+ FPClassTest FPTestMask = Test;
+
ISD::CondCode OrderedCmpOpcode = IsInverted ? ISD::SETUNE : ISD::SETOEQ;
ISD::CondCode UnorderedCmpOpcode = IsInverted ? ISD::SETONE : ISD::SETUEQ;
+  // See if we can fold an '| fcNan' into an unordered compare.
+ FPClassTest OrderedFPTestMask = FPTestMask & ~fcNan;
+
+ // Can't fold the ordered check if we're only testing for snan or qnan
+ // individually.
+ if ((FPTestMask & fcNan) != fcNan)
+ OrderedFPTestMask = FPTestMask;
+
+ const bool IsOrdered = FPTestMask == OrderedFPTestMask;
+
if (std::optional<bool> IsCmp0 =
isFCmpEqualZero(Test, Semantics, DAG.getMachineFunction());
IsCmp0 && (isCondCodeLegalOrCustom(
@@ -8628,6 +8641,27 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
return DAG.getSetCC(DL, ResultVT, Abs, Inf,
IsInverted ? ISD::SETUNE : ISD::SETOEQ);
}
+
+ if (OrderedFPTestMask == (fcSubnormal | fcZero) && !IsOrdered) {
+ // TODO: Could handle ordered case, but it produces worse code for
+ // x86. Maybe handle ordered if fabs is free?
+
+ ISD::CondCode OrderedOp = IsInverted ? ISD::SETUGE : ISD::SETOLT;
+ ISD::CondCode UnorderedOp = IsInverted ? ISD::SETOGE : ISD::SETULT;
+
+ if (isCondCodeLegalOrCustom(IsOrdered ? OrderedOp : UnorderedOp,
+ OperandVT.getScalarType().getSimpleVT())) {
+ // (issubnormal(x) || iszero(x)) --> fabs(x) < smallest_normal
+
+ // TODO: Maybe only makes sense if fabs is free. Integer test of
+ // exponent bits seems better for x86.
+ SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op);
+ SDValue SmallestNormal = DAG.getConstantFP(
+ APFloat::getSmallestNormalized(Semantics), DL, OperandVT);
+ return DAG.getSetCC(DL, ResultVT, Abs, SmallestNormal,
+ IsOrdered ? OrderedOp : UnorderedOp);
+ }
+ }
}
// In the general case use integer operations.
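Aside (not part of the patch): the new fold above relies on the IEEE-754 identity that, for any non-NaN x, (issubnormal(x) || iszero(x)) is equivalent to fabs(x) < smallest_normal. A standalone scalar check of that identity, assuming standard float semantics:

    #include <cassert>
    #include <cmath>
    #include <initializer_list>
    #include <limits>

    static bool isSubnormalOrZero(float X) {
      int C = std::fpclassify(X);
      return C == FP_SUBNORMAL || C == FP_ZERO;
    }

    static bool foldedForm(float X) {
      // smallest normal float, i.e. the constant the combine materializes.
      return std::fabs(X) < std::numeric_limits<float>::min();
    }

    int main() {
      for (float X : {0.0f, -0.0f, 1e-45f, -1e-40f, 1.0f, -3.5f,
                      std::numeric_limits<float>::infinity()})
        assert(isSubnormalOrZero(X) == foldedForm(X));
      return 0;
    }

The patch currently only enables this for the unordered variant (the class mask also includes fcNan), using SETULT/SETOGE so NaN inputs behave correctly; the ordered form is left as a TODO.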
@@ -9345,6 +9379,26 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const {
}
}
+ // avgflooru(lhs, rhs) -> or(lshr(add(lhs, rhs),1),shl(overflow, typesize-1))
+ if (Opc == ISD::AVGFLOORU && VT.isScalarInteger() && !isTypeLegal(VT)) {
+ SDValue UAddWithOverflow =
+ DAG.getNode(ISD::UADDO, dl, DAG.getVTList(VT, MVT::i1), {RHS, LHS});
+
+ SDValue Sum = UAddWithOverflow.getValue(0);
+ SDValue Overflow = UAddWithOverflow.getValue(1);
+
+ // Right shift the sum by 1
+ SDValue One = DAG.getShiftAmountConstant(1, VT, dl);
+ SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One);
+
+ SDValue ZeroExtOverflow = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Overflow);
+ SDValue OverflowShl =
+ DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
+
+ return DAG.getNode(ISD::OR, dl, VT, LShrVal, OverflowShl);
+ }
+
// avgceils(lhs, rhs) -> sub(or(lhs,rhs),ashr(xor(lhs,rhs),1))
// avgceilu(lhs, rhs) -> sub(or(lhs,rhs),lshr(xor(lhs,rhs),1))
// avgfloors(lhs, rhs) -> add(and(lhs,rhs),ashr(xor(lhs,rhs),1))
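Aside (not part of the patch): the AVGFLOORU expansion added above works because the carry-out of the wrapping add is exactly the bit lost from the top of lhs + rhs. A standalone exhaustive check of the identity on 8-bit values:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned L = 0; L < 256; ++L) {
        for (unsigned R = 0; R < 256; ++R) {
          uint8_t Sum = static_cast<uint8_t>(L + R);   // wrapping 8-bit add
          bool Carry = (L + R) > 0xFF;                 // the UADDO overflow bit
          uint8_t Expanded = static_cast<uint8_t>(
              (Sum >> 1) | (static_cast<unsigned>(Carry) << 7));
          uint8_t Reference = static_cast<uint8_t>((L + R) >> 1); // exact floor avg
          assert(Expanded == Reference);
        }
      }
      return 0;
    }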
diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 869b383..0d7a51b 100644
--- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -260,9 +260,7 @@ void MCJIT::finalizeObject() {
// Generate code for module is going to move objects out of the 'added' list,
// so we need to copy that out before using it:
- SmallVector<Module*, 16> ModsToAdd;
- for (auto *M : OwnedModules.added())
- ModsToAdd.push_back(M);
+ SmallVector<Module *, 16> ModsToAdd(OwnedModules.added());
for (auto *M : ModsToAdd)
generateCodeForModule(M);
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index bf19934..cf05b11 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -240,6 +240,8 @@ BasicBlock::~BasicBlock() {
void BasicBlock::setParent(Function *parent) {
// Set Parent=parent, updating instruction symtab entries as appropriate.
+ if (Parent != parent)
+ Number = parent ? parent->NextBlockNum++ : -1u;
InstList.setSymTabObject(&Parent, parent);
}
@@ -626,9 +628,7 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) {
// to reflect that the incoming branches will be from the New block and not
// from predecessors of the 'this' block.
// Save predecessors to separate vector before modifying them.
- SmallVector<BasicBlock *, 4> Predecessors;
- for (BasicBlock *Pred : predecessors(this))
- Predecessors.push_back(Pred);
+ SmallVector<BasicBlock *, 4> Predecessors(predecessors(this));
for (BasicBlock *Pred : Predecessors) {
Instruction *TI = Pred->getTerminator();
TI->replaceSuccessorWith(this, New);
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 9b0dd5f..69520fd 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/Function.h"
#include "SymbolTableListTraitsImpl.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
@@ -85,6 +86,27 @@ static cl::opt<int> NonGlobalValueMaxNameSize(
extern cl::opt<bool> UseNewDbgInfoFormat;
+void Function::renumberBlocks() {
+ validateBlockNumbers();
+
+ NextBlockNum = 0;
+ for (auto &BB : *this)
+ BB.Number = NextBlockNum++;
+ BlockNumEpoch++;
+}
+
+void Function::validateBlockNumbers() const {
+#ifndef NDEBUG
+ BitVector Numbers(NextBlockNum);
+ for (const auto &BB : *this) {
+ unsigned Num = BB.getNumber();
+ assert(Num < NextBlockNum && "out of range block number");
+ assert(!Numbers[Num] && "duplicate block numbers");
+ Numbers.set(Num);
+ }
+#endif
+}
+
void Function::convertToNewDbgValues() {
IsNewDbgInfoFormat = true;
for (auto &BB : *this) {
@@ -509,6 +531,8 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
}
Function::~Function() {
+ validateBlockNumbers();
+
dropAllReferences(); // After this it is safe to delete instructions.
// Delete all of the method arguments and unlink from symbol table...
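Aside (not part of the patch): the new per-function block numbering assigns each BasicBlock a number, and renumberBlocks() compacts the numbering into a dense 0..size()-1 range, so per-block data can live in a plain vector instead of a DenseMap. A minimal sketch using only the APIs shown above plus existing Function/BasicBlock methods:

    #include "llvm/IR/Function.h"
    #include <vector>

    // Record the instruction count of every block, keyed by its block number.
    void recordBlockSizes(llvm::Function &F) {
      F.renumberBlocks();                        // make the numbering dense again
      std::vector<size_t> InstCount(F.size());   // one slot per basic block
      for (llvm::BasicBlock &BB : F)
        InstCount[BB.getNumber()] = BB.size();   // BasicBlock::size() = #instructions
    }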
diff --git a/llvm/lib/IR/OptBisect.cpp b/llvm/lib/IR/OptBisect.cpp
index 893a5e5..559b199 100644
--- a/llvm/lib/IR/OptBisect.cpp
+++ b/llvm/lib/IR/OptBisect.cpp
@@ -32,6 +32,11 @@ static cl::opt<int> OptBisectLimit("opt-bisect-limit", cl::Hidden,
}),
cl::desc("Maximum optimization to perform"));
+static cl::opt<bool> OptBisectVerbose(
+ "opt-bisect-verbose",
+ cl::desc("Show verbose output when opt-bisect-limit is set"), cl::Hidden,
+ cl::init(true), cl::Optional);
+
static void printPassMessage(const StringRef &Name, int PassNum,
StringRef TargetDesc, bool Running) {
StringRef Status = Running ? "" : "NOT ";
@@ -45,7 +50,8 @@ bool OptBisect::shouldRunPass(const StringRef PassName,
int CurBisectNum = ++LastBisectNum;
bool ShouldRun = (BisectLimit == -1 || CurBisectNum <= BisectLimit);
- printPassMessage(PassName, CurBisectNum, IRDescription, ShouldRun);
+ if (OptBisectVerbose)
+ printPassMessage(PassName, CurBisectNum, IRDescription, ShouldRun);
return ShouldRun;
}
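For reference, a hedged example of driving the new flag (the flag names come from the cl::opt definitions above; the exact opt invocation and pipeline are illustrative assumptions):

    opt -passes='default<O2>' -opt-bisect-limit=250 -opt-bisect-verbose=false in.ll -o out.ll

With -opt-bisect-verbose=false the per-pass messages are suppressed while the pass-skipping behavior of -opt-bisect-limit is unchanged.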
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 8ce0caa..d806f80 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -155,6 +155,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
break;
}
[[fallthrough]];
+ case Triple::DriverKit:
case Triple::TvOS:
case Triple::WatchOS:
case Triple::XROS:
@@ -164,6 +165,10 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
default:
break;
}
+ } else if (TT.getOS() == Triple::BridgeOS) {
+ // TODO: BridgeOS should be included in isOSDarwin.
+ setLibcallName(RTLIB::EXP10_F32, "__exp10f");
+ setLibcallName(RTLIB::EXP10_F64, "__exp10");
} else {
setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index d5d642f..effaed2 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -335,6 +335,16 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
if (!Conf.DisableVerify)
MPM.addPass(VerifierPass());
+ if (PrintPipelinePasses) {
+ std::string PipelineStr;
+ raw_string_ostream OS(PipelineStr);
+ MPM.printPipeline(OS, [&PIC](StringRef ClassName) {
+ auto PassName = PIC.getPassNameForClassName(ClassName);
+ return PassName.empty() ? ClassName : PassName;
+ });
+ outs() << "pipeline-passes: " << PipelineStr << '\n';
+ }
+
MPM.run(Mod, MAM);
}
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index ceeb7af..cbeb41f5 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -56,8 +56,6 @@ STATISTIC(EmittedRelaxableFragments,
"Number of emitted assembler fragments - relaxable");
STATISTIC(EmittedDataFragments,
"Number of emitted assembler fragments - data");
-STATISTIC(EmittedCompactEncodedInstFragments,
- "Number of emitted assembler fragments - compact encoded inst");
STATISTIC(EmittedAlignFragments,
"Number of emitted assembler fragments - align");
STATISTIC(EmittedFillFragments,
@@ -253,8 +251,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const {
return cast<MCDataFragment>(F).getContents().size();
case MCFragment::FT_Relaxable:
return cast<MCRelaxableFragment>(F).getContents().size();
- case MCFragment::FT_CompactEncodedInst:
- return cast<MCCompactEncodedInstFragment>(F).getContents().size();
case MCFragment::FT_Fill: {
auto &FF = cast<MCFillFragment>(F);
int64_t NumValues = 0;
@@ -432,6 +428,28 @@ void MCAssembler::layoutBundle(MCFragment *Prev, MCFragment *F) const {
DF->Offset = EF->Offset;
}
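+// Compute fragment offsets for Sec lazily, applying bundle padding when
+// bundling is enabled. layout() clears the per-section HasLayout flag again
+// whenever relaxation changes any fragment size, forcing a recompute here.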
+void MCAssembler::ensureValid(MCSection &Sec) const {
+ if (Sec.hasLayout())
+ return;
+ Sec.setHasLayout(true);
+ MCFragment *Prev = nullptr;
+ uint64_t Offset = 0;
+ for (MCFragment &F : Sec) {
+ F.Offset = Offset;
+ if (isBundlingEnabled() && F.hasInstructions()) {
+ layoutBundle(Prev, &F);
+ Offset = F.Offset;
+ }
+ Offset += computeFragmentSize(F);
+ Prev = &F;
+ }
+}
+
+uint64_t MCAssembler::getFragmentOffset(const MCFragment &F) const {
+ ensureValid(*F.getParent());
+ return F.Offset;
+}
+
// Simple getSymbolOffset helper for the non-variable case.
static bool getLabelOffset(const MCAssembler &Asm, const MCSymbol &S,
bool ReportError, uint64_t &Val) {
@@ -662,11 +680,6 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
OS << cast<MCRelaxableFragment>(F).getContents();
break;
- case MCFragment::FT_CompactEncodedInst:
- ++stats::EmittedCompactEncodedInstFragments;
- OS << cast<MCCompactEncodedInstFragment>(F).getContents();
- break;
-
case MCFragment::FT_Fill: {
++stats::EmittedFillFragments;
const MCFillFragment &FF = cast<MCFillFragment>(F);
@@ -916,20 +929,22 @@ void MCAssembler::layout() {
// Layout until everything fits.
this->HasLayout = true;
- for (MCSection &Sec : *this)
- layoutSection(Sec);
while (layoutOnce()) {
+ if (getContext().hadError())
+ return;
+ // Size of fragments in one section can depend on the size of fragments in
+ // another. If any fragment has changed size, we have to re-layout (and
+ // as a result possibly further relax) all.
+ for (MCSection &Sec : *this)
+ Sec.setHasLayout(false);
}
DEBUG_WITH_TYPE("mc-dump", {
errs() << "assembler backend - post-relaxation\n--\n";
dump(); });
- // Some targets might want to adjust fragment offsets. If so, perform another
- // layout loop.
- if (getBackend().finishLayout(*this))
- for (MCSection &Sec : *this)
- layoutSection(Sec);
+ // Finalize the layout, including fragment lowering.
+ getBackend().finishLayout(*this);
DEBUG_WITH_TYPE("mc-dump", {
errs() << "assembler backend - final-layout\n--\n";
@@ -1282,42 +1297,15 @@ bool MCAssembler::relaxFragment(MCFragment &F) {
}
}
-void MCAssembler::layoutSection(MCSection &Sec) {
- MCFragment *Prev = nullptr;
- uint64_t Offset = 0;
- for (MCFragment &F : Sec) {
- F.Offset = Offset;
- if (LLVM_UNLIKELY(isBundlingEnabled())) {
- if (F.hasInstructions()) {
- layoutBundle(Prev, &F);
- Offset = F.Offset;
- }
- Prev = &F;
- }
- Offset += computeFragmentSize(F);
- }
-}
-
bool MCAssembler::layoutOnce() {
++stats::RelaxationSteps;
- // Size of fragments in one section can depend on the size of fragments in
- // another. If any fragment has changed size, we have to re-layout (and
- // as a result possibly further relax) all.
- bool ChangedAny = false;
- for (MCSection &Sec : *this) {
- for (;;) {
- bool Changed = false;
- for (MCFragment &F : Sec)
- if (relaxFragment(F))
- Changed = true;
- ChangedAny |= Changed;
- if (!Changed)
- break;
- layoutSection(Sec);
- }
- }
- return ChangedAny;
+ bool Changed = false;
+ for (MCSection &Sec : *this)
+ for (MCFragment &Frag : Sec)
+ if (relaxFragment(Frag))
+ Changed = true;
+ return Changed;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index 228c4fb..ac3946b 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -697,10 +697,11 @@ MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
MCSymbol *COMDATSymbol = nullptr;
if (!COMDATSymName.empty()) {
COMDATSymbol = getOrCreateSymbol(COMDATSymName);
+ assert(COMDATSymbol && "COMDATSymbol is null");
COMDATSymName = COMDATSymbol->getName();
// A non-associative COMDAT is considered to define the COMDAT symbol. Check
// the redefinition error.
- if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE && COMDATSymbol &&
+ if (Selection != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE &&
COMDATSymbol->isDefined() &&
(!COMDATSymbol->isInSection() ||
cast<MCSectionCOFF>(COMDATSymbol->getSection()).getCOMDATSymbol() !=
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index c84d88e..8dda587 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -19,6 +19,7 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFragment.h"
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index b101250..ac1b5a8 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -27,7 +27,8 @@
using namespace llvm;
MCFragment::MCFragment(FragmentType Kind, bool HasInstructions)
- : Kind(Kind), HasInstructions(HasInstructions), LinkerRelaxable(false) {}
+ : Kind(Kind), HasInstructions(HasInstructions), AlignToBundleEnd(false),
+ LinkerRelaxable(false), AllowAutoPadding(false) {}
void MCFragment::destroy() {
switch (Kind) {
@@ -37,9 +38,6 @@ void MCFragment::destroy() {
case FT_Data:
cast<MCDataFragment>(this)->~MCDataFragment();
return;
- case FT_CompactEncodedInst:
- cast<MCCompactEncodedInstFragment>(this)->~MCCompactEncodedInstFragment();
- return;
case FT_Fill:
cast<MCFillFragment>(this)->~MCFillFragment();
return;
@@ -107,8 +105,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
switch (getKind()) {
case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
case MCFragment::FT_Data: OS << "MCDataFragment"; break;
- case MCFragment::FT_CompactEncodedInst:
- OS << "MCCompactEncodedInstFragment"; break;
case MCFragment::FT_Fill: OS << "MCFillFragment"; break;
case MCFragment::FT_Nops:
OS << "MCFNopsFragment";
@@ -168,19 +164,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
}
break;
}
- case MCFragment::FT_CompactEncodedInst: {
- const auto *CEIF =
- cast<MCCompactEncodedInstFragment>(this);
- OS << "\n ";
- OS << " Contents:[";
- const SmallVectorImpl<char> &Contents = CEIF->getContents();
- for (unsigned i = 0, e = Contents.size(); i != e; ++i) {
- if (i) OS << ",";
- OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF);
- }
- OS << "] (" << Contents.size() << " bytes)";
- break;
- }
case MCFragment::FT_Fill: {
const auto *FF = cast<MCFillFragment>(this);
OS << " Value:" << static_cast<unsigned>(FF->getValue())
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 97e87a4..8c2ee56 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -23,8 +23,8 @@ using namespace llvm;
MCSection::MCSection(SectionVariant V, StringRef Name, bool IsText,
bool IsVirtual, MCSymbol *Begin)
: Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
- IsRegistered(false), IsText(IsText), IsVirtual(IsVirtual), Name(Name),
- Variant(V) {
+ HasLayout(false), IsRegistered(false), IsText(IsText),
+ IsVirtual(IsVirtual), Name(Name), Variant(V) {
DummyFragment.setParent(this);
// The initial subsection number is 0. Create a fragment list.
CurFragList = &Subsections.emplace_back(0u, FragList{}).second;
diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp
index c542c4e..78fc0c4 100644
--- a/llvm/lib/ObjCopy/ConfigManager.cpp
+++ b/llvm/lib/ObjCopy/ConfigManager.cpp
@@ -26,7 +26,8 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
Common.DecompressDebugSections ||
Common.DiscardMode == DiscardType::Locals ||
!Common.SymbolsToAdd.empty() || Common.GapFill != 0 ||
- Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0)
+ Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
+ !Common.ChangeSectionAddress.empty())
return createStringError(llvm::errc::invalid_argument,
"option is not supported for COFF");
@@ -48,7 +49,8 @@ Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
Common.DecompressDebugSections || Common.StripUnneeded ||
Common.DiscardMode == DiscardType::Locals ||
!Common.SymbolsToAdd.empty() || Common.GapFill != 0 ||
- Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0)
+ Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
+ !Common.ChangeSectionAddress.empty())
return createStringError(llvm::errc::invalid_argument,
"option is not supported for MachO");
@@ -68,7 +70,8 @@ Expected<const WasmConfig &> ConfigManager::getWasmConfig() const {
!Common.SectionsToRename.empty() || !Common.SetSectionAlignment.empty() ||
!Common.SetSectionFlags.empty() || !Common.SetSectionType.empty() ||
!Common.SymbolsToRename.empty() || Common.GapFill != 0 ||
- Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0)
+ Common.PadTo != 0 || Common.ChangeSectionLMAValAll != 0 ||
+ !Common.ChangeSectionAddress.empty())
return createStringError(llvm::errc::invalid_argument,
"only flags for section dumping, removal, and "
"addition are supported");
@@ -97,7 +100,8 @@ Expected<const XCOFFConfig &> ConfigManager::getXCOFFConfig() const {
Common.StripDebug || Common.StripNonAlloc || Common.StripSections ||
Common.Weaken || Common.StripUnneeded || Common.DecompressDebugSections ||
Common.GapFill != 0 || Common.PadTo != 0 ||
- Common.ChangeSectionLMAValAll != 0) {
+ Common.ChangeSectionLMAValAll != 0 ||
+ !Common.ChangeSectionAddress.empty()) {
return createStringError(
llvm::errc::invalid_argument,
"no flags are supported yet, only basic copying is allowed");
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 075455c..4059886 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -745,6 +745,56 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig,
}
}
+ if (!Config.ChangeSectionAddress.empty()) {
+ if (Obj.Type != ELF::ET_REL)
+ return createStringError(
+ object_error::invalid_file_type,
+ "cannot change section address in a non-relocatable file");
+
+ StringMap<AddressUpdate> SectionsToUpdateAddress;
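+    // Walk the requests in reverse so that, for each section, the last
+    // matching address-change request on the command line wins (try_emplace
+    // keeps only the first successful insertion per section name).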
+ for (const SectionPatternAddressUpdate &PatternUpdate :
+ make_range(Config.ChangeSectionAddress.rbegin(),
+ Config.ChangeSectionAddress.rend())) {
+ for (SectionBase &Sec : Obj.sections()) {
+ if (PatternUpdate.SectionPattern.matches(Sec.Name) &&
+ SectionsToUpdateAddress.try_emplace(Sec.Name, PatternUpdate.Update)
+ .second) {
+ if (PatternUpdate.Update.Kind == AdjustKind::Subtract &&
+ Sec.Addr < PatternUpdate.Update.Value) {
+ return createStringError(
+ errc::invalid_argument,
+ "address 0x" + Twine::utohexstr(Sec.Addr) +
+ " cannot be decreased by 0x" +
+ Twine::utohexstr(PatternUpdate.Update.Value) +
+ ". The result would underflow");
+ }
+ if (PatternUpdate.Update.Kind == AdjustKind::Add &&
+ Sec.Addr > std::numeric_limits<uint64_t>::max() -
+ PatternUpdate.Update.Value) {
+ return createStringError(
+ errc::invalid_argument,
+ "address 0x" + Twine::utohexstr(Sec.Addr) +
+ " cannot be increased by 0x" +
+ Twine::utohexstr(PatternUpdate.Update.Value) +
+ ". The result would overflow");
+ }
+
+ switch (PatternUpdate.Update.Kind) {
+ case (AdjustKind::Set):
+ Sec.Addr = PatternUpdate.Update.Value;
+ break;
+ case (AdjustKind::Subtract):
+ Sec.Addr -= PatternUpdate.Update.Value;
+ break;
+ case (AdjustKind::Add):
+ Sec.Addr += PatternUpdate.Update.Value;
+ break;
+ }
+ }
+ }
+ }
+ }
+
if (Config.OnlyKeepDebug)
for (auto &Sec : Obj.sections())
if (Sec.Flags & SHF_ALLOC && Sec.Type != SHT_NOTE)
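A hedged usage sketch for the new llvm-objcopy functionality above (the option spelling is assumed to mirror GNU objcopy's --change-section-address, with =, + and - mapping to AdjustKind::Set/Add/Subtract; only relocatable ET_REL inputs are accepted, per the check above):

    llvm-objcopy --change-section-address '.debug*+0x20' in.o out.o
    llvm-objcopy --change-section-address .rodata=0x100000 in.o out.o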
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 757b20d..a611872 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -979,7 +979,8 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>()));
- MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
+ if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+ MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
// Make sure we don't affect potential future NoRerun CGSCC adaptors.
MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
@@ -1021,8 +1022,9 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
buildFunctionSimplificationPipeline(Level, Phase),
PTO.EagerlyInvalidateAnalyses));
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
- CoroSplitPass(Level != OptimizationLevel::O0)));
+ if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ CoroSplitPass(Level != OptimizationLevel::O0)));
return MPM;
}
@@ -1219,7 +1221,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and argument promotion.
MPM.addPass(DeadArgumentEliminationPass());
- MPM.addPass(CoroCleanupPass());
+ if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink)
+ MPM.addPass(CoroCleanupPass());
// Optimize globals now that functions are fully simplified.
MPM.addPass(GlobalOptPass());
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index ceadb34..4f12985 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -8,6 +8,7 @@
#include "llvm/SandboxIR/SandboxIR.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/Support/Debug.h"
#include <sstream>
@@ -16,7 +17,12 @@ using namespace llvm::sandboxir;
Value *Use::get() const { return Ctx->getValue(LLVMUse->get()); }
-void Use::set(Value *V) { LLVMUse->set(V->Val); }
+void Use::set(Value *V) {
+ auto &Tracker = Ctx->getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(std::make_unique<UseSet>(*this, Tracker));
+ LLVMUse->set(V->Val);
+}
unsigned Use::getOperandNo() const { return Usr->getUseOperandNo(*this); }
@@ -84,6 +90,25 @@ UserUseIterator &UserUseIterator::operator++() {
return *this;
}
+OperandUseIterator OperandUseIterator::operator+(unsigned Num) const {
+ sandboxir::Use U = Use.getUser()->getOperandUseInternal(
+ Use.getOperandNo() + Num, /*Verify=*/true);
+ return OperandUseIterator(U);
+}
+
+OperandUseIterator OperandUseIterator::operator-(unsigned Num) const {
+ assert(Use.getOperandNo() >= Num && "Out of bounds!");
+ sandboxir::Use U = Use.getUser()->getOperandUseInternal(
+ Use.getOperandNo() - Num, /*Verify=*/true);
+ return OperandUseIterator(U);
+}
+
+int OperandUseIterator::operator-(const OperandUseIterator &Other) const {
+ int ThisOpNo = Use.getOperandNo();
+ int OtherOpNo = Other.Use.getOperandNo();
+ return ThisOpNo - OtherOpNo;
+}
+
Value::Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx)
: SubclassID(SubclassID), Val(Val), Ctx(Ctx) {
#ifndef NDEBUG
@@ -288,11 +313,10 @@ BBIterator &BBIterator::operator--() {
const char *Instruction::getOpcodeName(Opcode Opc) {
switch (Opc) {
-#define DEF_VALUE(ID, CLASS)
-#define DEF_USER(ID, CLASS)
#define OP(OPC) \
case Opcode::OPC: \
return #OPC;
+#define OPCODES(...) __VA_ARGS__
#define DEF_INSTR(ID, OPC, CLASS) OPC
#include "llvm/SandboxIR/SandboxIRValues.def"
}
@@ -589,11 +613,17 @@ void BranchInst::dump() const {
LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
Instruction *InsertBefore, Context &Ctx,
const Twine &Name) {
+ return create(Ty, Ptr, Align, InsertBefore, /*IsVolatile=*/false, Ctx, Name);
+}
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ Instruction *InsertBefore, bool IsVolatile,
+ Context &Ctx, const Twine &Name) {
llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
auto &Builder = Ctx.getLLVMIRBuilder();
Builder.SetInsertPoint(BeforeIR);
- auto *NewLI = Builder.CreateAlignedLoad(Ty, Ptr->Val, Align,
- /*isVolatile=*/false, Name);
+ auto *NewLI =
+ Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, IsVolatile, Name);
auto *NewSBI = Ctx.createLoadInst(NewLI);
return NewSBI;
}
@@ -601,10 +631,16 @@ LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
BasicBlock *InsertAtEnd, Context &Ctx,
const Twine &Name) {
+ return create(Ty, Ptr, Align, InsertAtEnd, /*IsVolatile=*/false, Ctx, Name);
+}
+
+LoadInst *LoadInst::create(Type *Ty, Value *Ptr, MaybeAlign Align,
+ BasicBlock *InsertAtEnd, bool IsVolatile,
+ Context &Ctx, const Twine &Name) {
auto &Builder = Ctx.getLLVMIRBuilder();
Builder.SetInsertPoint(cast<llvm::BasicBlock>(InsertAtEnd->Val));
- auto *NewLI = Builder.CreateAlignedLoad(Ty, Ptr->Val, Align,
- /*isVolatile=*/false, Name);
+ auto *NewLI =
+ Builder.CreateAlignedLoad(Ty, Ptr->Val, Align, IsVolatile, Name);
auto *NewSBI = Ctx.createLoadInst(NewLI);
return NewSBI;
}
@@ -630,21 +666,32 @@ void LoadInst::dump() const {
#endif // NDEBUG
StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
Instruction *InsertBefore, Context &Ctx) {
+ return create(V, Ptr, Align, InsertBefore, /*IsVolatile=*/false, Ctx);
+}
+
+StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
+ Instruction *InsertBefore, bool IsVolatile,
+ Context &Ctx) {
llvm::Instruction *BeforeIR = InsertBefore->getTopmostLLVMInstruction();
auto &Builder = Ctx.getLLVMIRBuilder();
Builder.SetInsertPoint(BeforeIR);
- auto *NewSI =
- Builder.CreateAlignedStore(V->Val, Ptr->Val, Align, /*isVolatile=*/false);
+ auto *NewSI = Builder.CreateAlignedStore(V->Val, Ptr->Val, Align, IsVolatile);
auto *NewSBI = Ctx.createStoreInst(NewSI);
return NewSBI;
}
+
StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
BasicBlock *InsertAtEnd, Context &Ctx) {
+ return create(V, Ptr, Align, InsertAtEnd, /*IsVolatile=*/false, Ctx);
+}
+
+StoreInst *StoreInst::create(Value *V, Value *Ptr, MaybeAlign Align,
+ BasicBlock *InsertAtEnd, bool IsVolatile,
+ Context &Ctx) {
auto *InsertAtEndIR = cast<llvm::BasicBlock>(InsertAtEnd->Val);
auto &Builder = Ctx.getLLVMIRBuilder();
Builder.SetInsertPoint(InsertAtEndIR);
- auto *NewSI =
- Builder.CreateAlignedStore(V->Val, Ptr->Val, Align, /*isVolatile=*/false);
+ auto *NewSI = Builder.CreateAlignedStore(V->Val, Ptr->Val, Align, IsVolatile);
auto *NewSBI = Ctx.createStoreInst(NewSI);
return NewSBI;
}
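Aside (not part of the patch): a sketch of the new IsVolatile overloads added above. Ctx (a sandboxir::Context &), BB (a sandboxir::BasicBlock *), Ty and Ptr are assumed to be provided by the surrounding code:

    auto *Ld = sandboxir::LoadInst::create(Ty, Ptr, Align(8), /*InsertAtEnd=*/BB,
                                           /*IsVolatile=*/true, Ctx, "vload");
    sandboxir::StoreInst::create(Ld, Ptr, Align(8), /*InsertAtEnd=*/BB,
                                 /*IsVolatile=*/true, Ctx);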
@@ -713,6 +760,690 @@ void ReturnInst::dump() const {
dump(dbgs());
dbgs() << "\n";
}
+#endif // NDEBUG
+
+Value *CallBase::getCalledOperand() const {
+ return Ctx.getValue(cast<llvm::CallBase>(Val)->getCalledOperand());
+}
+
+Use CallBase::getCalledOperandUse() const {
+ llvm::Use *LLVMUse = &cast<llvm::CallBase>(Val)->getCalledOperandUse();
+ return Use(LLVMUse, cast<User>(Ctx.getValue(LLVMUse->getUser())), Ctx);
+}
+
+Function *CallBase::getCalledFunction() const {
+ return cast_or_null<Function>(
+ Ctx.getValue(cast<llvm::CallBase>(Val)->getCalledFunction()));
+}
+Function *CallBase::getCaller() {
+ return cast<Function>(Ctx.getValue(cast<llvm::CallBase>(Val)->getCaller()));
+}
+
+void CallBase::setCalledFunction(Function *F) {
+ // F's function type is private, so we rely on `setCalledFunction()` to update
+ // it. But even though we are calling `setCalledFunction()` we also need to
+ // track this change at the SandboxIR level, which is why we call
+ // `setCalledOperand()` here.
+  // Note: This may break if `setCalledFunction()` were to return early when
+  // `F` is already set, but we do have a unit test covering this case.
+ setCalledOperand(F);
+ cast<llvm::CallBase>(Val)->setCalledFunction(F->getFunctionType(),
+ cast<llvm::Function>(F->Val));
+}
+
+CallInst *CallInst::create(FunctionType *FTy, Value *Func,
+ ArrayRef<Value *> Args, BasicBlock::iterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr) {
+ auto &Builder = Ctx.getLLVMIRBuilder();
+ if (WhereIt != WhereBB->end())
+ Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+ else
+ Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+ SmallVector<llvm::Value *> LLVMArgs;
+ LLVMArgs.reserve(Args.size());
+ for (Value *Arg : Args)
+ LLVMArgs.push_back(Arg->Val);
+ llvm::CallInst *NewCI = Builder.CreateCall(FTy, Func->Val, LLVMArgs, NameStr);
+ return Ctx.createCallInst(NewCI);
+}
+
+CallInst *CallInst::create(FunctionType *FTy, Value *Func,
+ ArrayRef<Value *> Args, Instruction *InsertBefore,
+ Context &Ctx, const Twine &NameStr) {
+ return CallInst::create(FTy, Func, Args, InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, NameStr);
+}
+
+CallInst *CallInst::create(FunctionType *FTy, Value *Func,
+ ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &NameStr) {
+ return CallInst::create(FTy, Func, Args, InsertAtEnd->end(), InsertAtEnd, Ctx,
+ NameStr);
+}
+
+#ifndef NDEBUG
+void CallInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void CallInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+InvokeInst *InvokeInst::create(FunctionType *FTy, Value *Func,
+ BasicBlock *IfNormal, BasicBlock *IfException,
+ ArrayRef<Value *> Args, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr) {
+ auto &Builder = Ctx.getLLVMIRBuilder();
+ if (WhereIt != WhereBB->end())
+ Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+ else
+ Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+ SmallVector<llvm::Value *> LLVMArgs;
+ LLVMArgs.reserve(Args.size());
+ for (Value *Arg : Args)
+ LLVMArgs.push_back(Arg->Val);
+ llvm::InvokeInst *Invoke = Builder.CreateInvoke(
+ FTy, Func->Val, cast<llvm::BasicBlock>(IfNormal->Val),
+ cast<llvm::BasicBlock>(IfException->Val), LLVMArgs, NameStr);
+ return Ctx.createInvokeInst(Invoke);
+}
+
+InvokeInst *InvokeInst::create(FunctionType *FTy, Value *Func,
+ BasicBlock *IfNormal, BasicBlock *IfException,
+ ArrayRef<Value *> Args,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &NameStr) {
+ return create(FTy, Func, IfNormal, IfException, Args,
+ InsertBefore->getIterator(), InsertBefore->getParent(), Ctx,
+ NameStr);
+}
+
+InvokeInst *InvokeInst::create(FunctionType *FTy, Value *Func,
+ BasicBlock *IfNormal, BasicBlock *IfException,
+ ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &NameStr) {
+ return create(FTy, Func, IfNormal, IfException, Args, InsertAtEnd->end(),
+ InsertAtEnd, Ctx, NameStr);
+}
+
+BasicBlock *InvokeInst::getNormalDest() const {
+ return cast<BasicBlock>(
+ Ctx.getValue(cast<llvm::InvokeInst>(Val)->getNormalDest()));
+}
+BasicBlock *InvokeInst::getUnwindDest() const {
+ return cast<BasicBlock>(
+ Ctx.getValue(cast<llvm::InvokeInst>(Val)->getUnwindDest()));
+}
+void InvokeInst::setNormalDest(BasicBlock *BB) {
+ setOperand(1, BB);
+ assert(getNormalDest() == BB && "LLVM IR uses a different operan index!");
+}
+void InvokeInst::setUnwindDest(BasicBlock *BB) {
+ setOperand(2, BB);
+ assert(getUnwindDest() == BB && "LLVM IR uses a different operan index!");
+}
+Instruction *InvokeInst::getLandingPadInst() const {
+ return cast<Instruction>(
+ Ctx.getValue(cast<llvm::InvokeInst>(Val)->getLandingPadInst()));
+ ;
+}
+BasicBlock *InvokeInst::getSuccessor(unsigned SuccIdx) const {
+ return cast<BasicBlock>(
+ Ctx.getValue(cast<llvm::InvokeInst>(Val)->getSuccessor(SuccIdx)));
+}
+
+#ifndef NDEBUG
+void InvokeInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+void InvokeInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+CallBrInst *CallBrInst::create(FunctionType *FTy, Value *Func,
+ BasicBlock *DefaultDest,
+ ArrayRef<BasicBlock *> IndirectDests,
+ ArrayRef<Value *> Args, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr) {
+ auto &Builder = Ctx.getLLVMIRBuilder();
+ if (WhereIt != WhereBB->end())
+ Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+ else
+ Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+
+ SmallVector<llvm::BasicBlock *> LLVMIndirectDests;
+ LLVMIndirectDests.reserve(IndirectDests.size());
+ for (BasicBlock *IndDest : IndirectDests)
+ LLVMIndirectDests.push_back(cast<llvm::BasicBlock>(IndDest->Val));
+
+ SmallVector<llvm::Value *> LLVMArgs;
+ LLVMArgs.reserve(Args.size());
+ for (Value *Arg : Args)
+ LLVMArgs.push_back(Arg->Val);
+
+ llvm::CallBrInst *CallBr = Builder.CreateCallBr(
+ FTy, Func->Val, cast<llvm::BasicBlock>(DefaultDest->Val),
+ LLVMIndirectDests, LLVMArgs, NameStr);
+ return Ctx.createCallBrInst(CallBr);
+}
+
+CallBrInst *CallBrInst::create(FunctionType *FTy, Value *Func,
+ BasicBlock *DefaultDest,
+ ArrayRef<BasicBlock *> IndirectDests,
+ ArrayRef<Value *> Args,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &NameStr) {
+ return create(FTy, Func, DefaultDest, IndirectDests, Args,
+ InsertBefore->getIterator(), InsertBefore->getParent(), Ctx,
+ NameStr);
+}
+CallBrInst *CallBrInst::create(FunctionType *FTy, Value *Func,
+ BasicBlock *DefaultDest,
+ ArrayRef<BasicBlock *> IndirectDests,
+ ArrayRef<Value *> Args, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &NameStr) {
+ return create(FTy, Func, DefaultDest, IndirectDests, Args, InsertAtEnd->end(),
+ InsertAtEnd, Ctx, NameStr);
+}
+
+Value *CallBrInst::getIndirectDestLabel(unsigned Idx) const {
+ return Ctx.getValue(cast<llvm::CallBrInst>(Val)->getIndirectDestLabel(Idx));
+}
+Value *CallBrInst::getIndirectDestLabelUse(unsigned Idx) const {
+ return Ctx.getValue(
+ cast<llvm::CallBrInst>(Val)->getIndirectDestLabelUse(Idx));
+}
+BasicBlock *CallBrInst::getDefaultDest() const {
+ return cast<BasicBlock>(
+ Ctx.getValue(cast<llvm::CallBrInst>(Val)->getDefaultDest()));
+}
+BasicBlock *CallBrInst::getIndirectDest(unsigned Idx) const {
+ return cast<BasicBlock>(
+ Ctx.getValue(cast<llvm::CallBrInst>(Val)->getIndirectDest(Idx)));
+}
+llvm::SmallVector<BasicBlock *, 16> CallBrInst::getIndirectDests() const {
+ SmallVector<BasicBlock *, 16> BBs;
+ for (llvm::BasicBlock *LLVMBB :
+ cast<llvm::CallBrInst>(Val)->getIndirectDests())
+ BBs.push_back(cast<BasicBlock>(Ctx.getValue(LLVMBB)));
+ return BBs;
+}
+void CallBrInst::setDefaultDest(BasicBlock *BB) {
+ auto &Tracker = Ctx.getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(std::make_unique<CallBrInstSetDefaultDest>(this, Tracker));
+ cast<llvm::CallBrInst>(Val)->setDefaultDest(cast<llvm::BasicBlock>(BB->Val));
+}
+void CallBrInst::setIndirectDest(unsigned Idx, BasicBlock *BB) {
+ auto &Tracker = Ctx.getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(
+ std::make_unique<CallBrInstSetIndirectDest>(this, Idx, Tracker));
+ cast<llvm::CallBrInst>(Val)->setIndirectDest(Idx,
+ cast<llvm::BasicBlock>(BB->Val));
+}
+BasicBlock *CallBrInst::getSuccessor(unsigned Idx) const {
+ return cast<BasicBlock>(
+ Ctx.getValue(cast<llvm::CallBrInst>(Val)->getSuccessor(Idx)));
+}
+
+#ifndef NDEBUG
+void CallBrInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+void CallBrInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *GetElementPtrInst::create(Type *Ty, Value *Ptr,
+ ArrayRef<Value *> IdxList,
+ BasicBlock::iterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &NameStr) {
+ auto &Builder = Ctx.getLLVMIRBuilder();
+ if (WhereIt != WhereBB->end())
+ Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+ else
+ Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+ SmallVector<llvm::Value *> LLVMIdxList;
+ LLVMIdxList.reserve(IdxList.size());
+ for (Value *Idx : IdxList)
+ LLVMIdxList.push_back(Idx->Val);
+ llvm::Value *NewV = Builder.CreateGEP(Ty, Ptr->Val, LLVMIdxList, NameStr);
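+ // The builder may constant-fold the GEP, in which case it returns a Constant.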
+ if (auto *NewGEP = dyn_cast<llvm::GetElementPtrInst>(NewV))
+ return Ctx.createGetElementPtrInst(NewGEP);
+ assert(isa<llvm::Constant>(NewV) && "Expected constant");
+ return Ctx.getOrCreateConstant(cast<llvm::Constant>(NewV));
+}
+
+Value *GetElementPtrInst::create(Type *Ty, Value *Ptr,
+ ArrayRef<Value *> IdxList,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &NameStr) {
+ return GetElementPtrInst::create(Ty, Ptr, IdxList,
+ InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, NameStr);
+}
+
+Value *GetElementPtrInst::create(Type *Ty, Value *Ptr,
+ ArrayRef<Value *> IdxList,
+ BasicBlock *InsertAtEnd, Context &Ctx,
+ const Twine &NameStr) {
+ return GetElementPtrInst::create(Ty, Ptr, IdxList, InsertAtEnd->end(),
+ InsertAtEnd, Ctx, NameStr);
+}
+
+Value *GetElementPtrInst::getPointerOperand() const {
+ return Ctx.getValue(cast<llvm::GetElementPtrInst>(Val)->getPointerOperand());
+}
+
+#ifndef NDEBUG
+void GetElementPtrInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void GetElementPtrInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
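+// Maps an llvm::BasicBlock to its SandboxIR BasicBlock via the Context.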
+BasicBlock *PHINode::LLVMBBToBB::operator()(llvm::BasicBlock *LLVMBB) const {
+ return cast<BasicBlock>(Ctx.getValue(LLVMBB));
+}
+
+PHINode *PHINode::create(Type *Ty, unsigned NumReservedValues,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &Name) {
+ llvm::PHINode *NewPHI = llvm::PHINode::Create(
+ Ty, NumReservedValues, Name, InsertBefore->getTopmostLLVMInstruction());
+ return Ctx.createPHINode(NewPHI);
+}
+
+bool PHINode::classof(const Value *From) {
+ return From->getSubclassID() == ClassID::PHI;
+}
+
+Value *PHINode::getIncomingValue(unsigned Idx) const {
+ return Ctx.getValue(cast<llvm::PHINode>(Val)->getIncomingValue(Idx));
+}
+void PHINode::setIncomingValue(unsigned Idx, Value *V) {
+ auto &Tracker = Ctx.getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(std::make_unique<PHISetIncoming>(
+ *this, Idx, PHISetIncoming::What::Value, Tracker));
+
+ cast<llvm::PHINode>(Val)->setIncomingValue(Idx, V->Val);
+}
+BasicBlock *PHINode::getIncomingBlock(unsigned Idx) const {
+ return cast<BasicBlock>(
+ Ctx.getValue(cast<llvm::PHINode>(Val)->getIncomingBlock(Idx)));
+}
+BasicBlock *PHINode::getIncomingBlock(const Use &U) const {
+ llvm::Use *LLVMUse = U.LLVMUse;
+ llvm::BasicBlock *BB = cast<llvm::PHINode>(Val)->getIncomingBlock(*LLVMUse);
+ return cast<BasicBlock>(Ctx.getValue(BB));
+}
+void PHINode::setIncomingBlock(unsigned Idx, BasicBlock *BB) {
+ auto &Tracker = Ctx.getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(std::make_unique<PHISetIncoming>(
+ *this, Idx, PHISetIncoming::What::Block, Tracker));
+ cast<llvm::PHINode>(Val)->setIncomingBlock(Idx,
+ cast<llvm::BasicBlock>(BB->Val));
+}
+void PHINode::addIncoming(Value *V, BasicBlock *BB) {
+ auto &Tracker = Ctx.getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(std::make_unique<PHIAddIncoming>(*this, Tracker));
+
+ cast<llvm::PHINode>(Val)->addIncoming(V->Val,
+ cast<llvm::BasicBlock>(BB->Val));
+}
+Value *PHINode::removeIncomingValue(unsigned Idx) {
+ auto &Tracker = Ctx.getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(std::make_unique<PHIRemoveIncoming>(*this, Idx, Tracker));
+
+ llvm::Value *LLVMV =
+ cast<llvm::PHINode>(Val)->removeIncomingValue(Idx,
+ /*DeletePHIIfEmpty=*/false);
+ return Ctx.getValue(LLVMV);
+}
+Value *PHINode::removeIncomingValue(BasicBlock *BB) {
+ auto &Tracker = Ctx.getTracker();
+ if (Tracker.isTracking())
+ Tracker.track(std::make_unique<PHIRemoveIncoming>(
+ *this, getBasicBlockIndex(BB), Tracker));
+
+ auto *LLVMBB = cast<llvm::BasicBlock>(BB->Val);
+ llvm::Value *LLVMV =
+ cast<llvm::PHINode>(Val)->removeIncomingValue(LLVMBB,
+ /*DeletePHIIfEmpty=*/false);
+ return Ctx.getValue(LLVMV);
+}
+int PHINode::getBasicBlockIndex(const BasicBlock *BB) const {
+ auto *LLVMBB = cast<llvm::BasicBlock>(BB->Val);
+ return cast<llvm::PHINode>(Val)->getBasicBlockIndex(LLVMBB);
+}
+Value *PHINode::getIncomingValueForBlock(const BasicBlock *BB) const {
+ auto *LLVMBB = cast<llvm::BasicBlock>(BB->Val);
+ llvm::Value *LLVMV =
+ cast<llvm::PHINode>(Val)->getIncomingValueForBlock(LLVMBB);
+ return Ctx.getValue(LLVMV);
+}
+Value *PHINode::hasConstantValue() const {
+ llvm::Value *LLVMV = cast<llvm::PHINode>(Val)->hasConstantValue();
+ return LLVMV != nullptr ? Ctx.getValue(LLVMV) : nullptr;
+}
+
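+/// Returns the LLVM cast opcode that corresponds to \p Opc.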
+static llvm::Instruction::CastOps getLLVMCastOp(Instruction::Opcode Opc) {
+ switch (Opc) {
+ case Instruction::Opcode::ZExt:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::ZExt);
+ case Instruction::Opcode::SExt:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::SExt);
+ case Instruction::Opcode::FPToUI:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToUI);
+ case Instruction::Opcode::FPToSI:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPToSI);
+ case Instruction::Opcode::FPExt:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPExt);
+ case Instruction::Opcode::PtrToInt:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::PtrToInt);
+ case Instruction::Opcode::IntToPtr:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::IntToPtr);
+ case Instruction::Opcode::SIToFP:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::SIToFP);
+ case Instruction::Opcode::UIToFP:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::UIToFP);
+ case Instruction::Opcode::Trunc:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::Trunc);
+ case Instruction::Opcode::FPTrunc:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::FPTrunc);
+ case Instruction::Opcode::BitCast:
+ return static_cast<llvm::Instruction::CastOps>(llvm::Instruction::BitCast);
+ case Instruction::Opcode::AddrSpaceCast:
+ return static_cast<llvm::Instruction::CastOps>(
+ llvm::Instruction::AddrSpaceCast);
+ default:
+ llvm_unreachable("Opcode not suitable for CastInst!");
+ }
+}
+
+Value *CastInst::create(Type *DestTy, Opcode Op, Value *Operand,
+ BBIterator WhereIt, BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ assert(getLLVMCastOp(Op) && "Opcode not suitable for CastInst!");
+ auto &Builder = Ctx.getLLVMIRBuilder();
+ if (WhereIt == WhereBB->end())
+ Builder.SetInsertPoint(cast<llvm::BasicBlock>(WhereBB->Val));
+ else
+ Builder.SetInsertPoint((*WhereIt).getTopmostLLVMInstruction());
+ auto *NewV =
+ Builder.CreateCast(getLLVMCastOp(Op), Operand->Val, DestTy, Name);
+ if (auto *NewCI = dyn_cast<llvm::CastInst>(NewV))
+ return Ctx.createCastInst(NewCI);
+ assert(isa<llvm::Constant>(NewV) && "Expected constant");
+ return Ctx.getOrCreateConstant(cast<llvm::Constant>(NewV));
+}
+
+Value *CastInst::create(Type *DestTy, Opcode Op, Value *Operand,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &Name) {
+ return create(DestTy, Op, Operand, InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, Name);
+}
+
+Value *CastInst::create(Type *DestTy, Opcode Op, Value *Operand,
+ BasicBlock *InsertAtEnd, Context &Ctx,
+ const Twine &Name) {
+ return create(DestTy, Op, Operand, InsertAtEnd->end(), InsertAtEnd, Ctx,
+ Name);
+}
+
+bool CastInst::classof(const Value *From) {
+ return From->getSubclassID() == ClassID::Cast;
+}
+
+#ifndef NDEBUG
+void CastInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void CastInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *SIToFPInst::create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::SIToFP, Src, WhereIt,
+ WhereBB, Ctx, Name);
+}
+Value *SIToFPInst::create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, Name);
+}
+Value *SIToFPInst::create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertAtEnd->end(), InsertAtEnd, Ctx, Name);
+}
+
+#ifndef NDEBUG
+void SIToFPInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void SIToFPInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *FPToUIInst::create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::FPToUI, Src, WhereIt,
+ WhereBB, Ctx, Name);
+}
+Value *FPToUIInst::create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, Name);
+}
+Value *FPToUIInst::create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertAtEnd->end(), InsertAtEnd, Ctx, Name);
+}
+
+#ifndef NDEBUG
+void FPToUIInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void FPToUIInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *FPToSIInst::create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::FPToSI, Src, WhereIt,
+ WhereBB, Ctx, Name);
+}
+Value *FPToSIInst::create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, Name);
+}
+Value *FPToSIInst::create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertAtEnd->end(), InsertAtEnd, Ctx, Name);
+}
+
+#ifndef NDEBUG
+void FPToSIInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void FPToSIInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *IntToPtrInst::create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::IntToPtr, Src, WhereIt,
+ WhereBB, Ctx, Name);
+}
+Value *IntToPtrInst::create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, Name);
+}
+Value *IntToPtrInst::create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertAtEnd->end(), InsertAtEnd, Ctx, Name);
+}
+
+#ifndef NDEBUG
+void IntToPtrInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void IntToPtrInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *PtrToIntInst::create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::PtrToInt, Src, WhereIt,
+ WhereBB, Ctx, Name);
+}
+Value *PtrToIntInst::create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertBefore->getIterator(),
+ InsertBefore->getParent(), Ctx, Name);
+}
+Value *PtrToIntInst::create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name) {
+ return create(Src, DestTy, InsertAtEnd->end(), InsertAtEnd, Ctx, Name);
+}
+
+#ifndef NDEBUG
+void PHINode::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void PHINode::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+
+void PtrToIntInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void PtrToIntInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *BitCastInst::create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::BitCast, Src, WhereIt,
+ WhereBB, Ctx, Name);
+}
+
+Value *BitCastInst::create(Value *Src, Type *DestTy, Instruction *InsertBefore,
+ Context &Ctx, const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::BitCast, Src,
+ InsertBefore, Ctx, Name);
+}
+
+Value *BitCastInst::create(Value *Src, Type *DestTy, BasicBlock *InsertAtEnd,
+ Context &Ctx, const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::BitCast, Src,
+ InsertAtEnd, Ctx, Name);
+}
+
+#ifndef NDEBUG
+void BitCastInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void BitCastInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+Value *AddrSpaceCastInst::create(Value *Src, Type *DestTy, BBIterator WhereIt,
+ BasicBlock *WhereBB, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::AddrSpaceCast, Src,
+ WhereIt, WhereBB, Ctx, Name);
+}
+
+Value *AddrSpaceCastInst::create(Value *Src, Type *DestTy,
+ Instruction *InsertBefore, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::AddrSpaceCast, Src,
+ InsertBefore, Ctx, Name);
+}
+
+Value *AddrSpaceCastInst::create(Value *Src, Type *DestTy,
+ BasicBlock *InsertAtEnd, Context &Ctx,
+ const Twine &Name) {
+ return CastInst::create(DestTy, Instruction::Opcode::AddrSpaceCast, Src,
+ InsertAtEnd, Ctx, Name);
+}
+
+#ifndef NDEBUG
+void AddrSpaceCastInst::dump(raw_ostream &OS) const {
+ dumpCommonPrefix(OS);
+ dumpCommonSuffix(OS);
+}
+
+void AddrSpaceCastInst::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
void OpaqueInst::dump(raw_ostream &OS) const {
dumpCommonPrefix(OS);
@@ -819,7 +1550,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
return It->second.get();
if (auto *C = dyn_cast<llvm::Constant>(LLVMV)) {
- It->second = std::unique_ptr<Constant>(new Constant(C, *this));
+ if (auto *F = dyn_cast<llvm::Function>(LLVMV))
+ It->second = std::unique_ptr<Function>(new Function(F, *this));
+ else
+ It->second = std::unique_ptr<Constant>(new Constant(C, *this));
auto *NewC = It->second.get();
for (llvm::Value *COp : C->operands())
getOrCreateValueInternal(COp, C);
@@ -864,6 +1598,49 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
It->second = std::unique_ptr<ReturnInst>(new ReturnInst(LLVMRet, *this));
return It->second.get();
}
+ case llvm::Instruction::Call: {
+ auto *LLVMCall = cast<llvm::CallInst>(LLVMV);
+ It->second = std::unique_ptr<CallInst>(new CallInst(LLVMCall, *this));
+ return It->second.get();
+ }
+ case llvm::Instruction::Invoke: {
+ auto *LLVMInvoke = cast<llvm::InvokeInst>(LLVMV);
+ It->second = std::unique_ptr<InvokeInst>(new InvokeInst(LLVMInvoke, *this));
+ return It->second.get();
+ }
+ case llvm::Instruction::CallBr: {
+ auto *LLVMCallBr = cast<llvm::CallBrInst>(LLVMV);
+ It->second = std::unique_ptr<CallBrInst>(new CallBrInst(LLVMCallBr, *this));
+ return It->second.get();
+ }
+ case llvm::Instruction::GetElementPtr: {
+ auto *LLVMGEP = cast<llvm::GetElementPtrInst>(LLVMV);
+ It->second = std::unique_ptr<GetElementPtrInst>(
+ new GetElementPtrInst(LLVMGEP, *this));
+ return It->second.get();
+ }
+ case llvm::Instruction::ZExt:
+ case llvm::Instruction::SExt:
+ case llvm::Instruction::FPToUI:
+ case llvm::Instruction::FPToSI:
+ case llvm::Instruction::FPExt:
+ case llvm::Instruction::PtrToInt:
+ case llvm::Instruction::IntToPtr:
+ case llvm::Instruction::SIToFP:
+ case llvm::Instruction::UIToFP:
+ case llvm::Instruction::Trunc:
+ case llvm::Instruction::FPTrunc:
+ case llvm::Instruction::BitCast:
+ case llvm::Instruction::AddrSpaceCast: {
+ auto *LLVMCast = cast<llvm::CastInst>(LLVMV);
+ It->second = std::unique_ptr<CastInst>(new CastInst(LLVMCast, *this));
+ return It->second.get();
+ }
+ case llvm::Instruction::PHI: {
+ auto *LLVMPhi = cast<llvm::PHINode>(LLVMV);
+ It->second = std::unique_ptr<PHINode>(new PHINode(LLVMPhi, *this));
+ return It->second.get();
+ }
default:
break;
}
@@ -907,6 +1684,37 @@ ReturnInst *Context::createReturnInst(llvm::ReturnInst *I) {
return cast<ReturnInst>(registerValue(std::move(NewPtr)));
}
+CallInst *Context::createCallInst(llvm::CallInst *I) {
+ auto NewPtr = std::unique_ptr<CallInst>(new CallInst(I, *this));
+ return cast<CallInst>(registerValue(std::move(NewPtr)));
+}
+
+InvokeInst *Context::createInvokeInst(llvm::InvokeInst *I) {
+ auto NewPtr = std::unique_ptr<InvokeInst>(new InvokeInst(I, *this));
+ return cast<InvokeInst>(registerValue(std::move(NewPtr)));
+}
+
+CallBrInst *Context::createCallBrInst(llvm::CallBrInst *I) {
+ auto NewPtr = std::unique_ptr<CallBrInst>(new CallBrInst(I, *this));
+ return cast<CallBrInst>(registerValue(std::move(NewPtr)));
+}
+
+GetElementPtrInst *
+Context::createGetElementPtrInst(llvm::GetElementPtrInst *I) {
+ auto NewPtr =
+ std::unique_ptr<GetElementPtrInst>(new GetElementPtrInst(I, *this));
+ return cast<GetElementPtrInst>(registerValue(std::move(NewPtr)));
+}
+
+CastInst *Context::createCastInst(llvm::CastInst *I) {
+ auto NewPtr = std::unique_ptr<CastInst>(new CastInst(I, *this));
+ return cast<CastInst>(registerValue(std::move(NewPtr)));
+}
+PHINode *Context::createPHINode(llvm::PHINode *I) {
+ auto NewPtr = std::unique_ptr<PHINode>(new PHINode(I, *this));
+ return cast<PHINode>(registerValue(std::move(NewPtr)));
+}
+
Value *Context::getValue(llvm::Value *V) const {
auto It = LLVMValueToValueMap.find(V);
if (It != LLVMValueToValueMap.end())
@@ -917,13 +1725,13 @@ Value *Context::getValue(llvm::Value *V) const {
Function *Context::createFunction(llvm::Function *F) {
assert(getValue(F) == nullptr && "Already exists!");
auto NewFPtr = std::unique_ptr<Function>(new Function(F, *this));
+ auto *SBF = cast<Function>(registerValue(std::move(NewFPtr)));
// Create arguments.
for (auto &Arg : F->args())
getOrCreateArgument(&Arg);
// Create BBs.
for (auto &BB : *F)
createBasicBlock(&BB);
- auto *SBF = cast<Function>(registerValue(std::move(NewFPtr)));
return SBF;
}
diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp
index c741776..0310160 100644
--- a/llvm/lib/SandboxIR/Tracker.cpp
+++ b/llvm/lib/SandboxIR/Tracker.cpp
@@ -42,6 +42,81 @@ void UseSwap::dump() const {
}
#endif // NDEBUG
+PHISetIncoming::PHISetIncoming(PHINode &PHI, unsigned Idx, What What,
+ Tracker &Tracker)
+ : IRChangeBase(Tracker), PHI(PHI), Idx(Idx) {
+ switch (What) {
+ case What::Value:
+ OrigValueOrBB = PHI.getIncomingValue(Idx);
+ break;
+ case What::Block:
+ OrigValueOrBB = PHI.getIncomingBlock(Idx);
+ break;
+ }
+}
+
+void PHISetIncoming::revert() {
+ if (auto *V = OrigValueOrBB.dyn_cast<Value *>())
+ PHI.setIncomingValue(Idx, V);
+ else
+ PHI.setIncomingBlock(Idx, OrigValueOrBB.get<BasicBlock *>());
+}
+
+#ifndef NDEBUG
+void PHISetIncoming::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+PHIRemoveIncoming::PHIRemoveIncoming(PHINode &PHI, unsigned RemovedIdx,
+ Tracker &Tracker)
+ : IRChangeBase(Tracker), PHI(PHI), RemovedIdx(RemovedIdx) {
+ RemovedV = PHI.getIncomingValue(RemovedIdx);
+ RemovedBB = PHI.getIncomingBlock(RemovedIdx);
+}
+
+void PHIRemoveIncoming::revert() {
+ // Special case: if the PHI is now empty we don't need to care about the
+ // order of the incoming values.
+ unsigned NumIncoming = PHI.getNumIncomingValues();
+ if (NumIncoming == 0) {
+ PHI.addIncoming(RemovedV, RemovedBB);
+ return;
+ }
+ // Shift the incoming values one slot towards the end, from the last entry
+ // down to `RemovedIdx`. Start by appending a copy of the last entry.
+ unsigned LastIdx = NumIncoming - 1;
+ PHI.addIncoming(PHI.getIncomingValue(LastIdx), PHI.getIncomingBlock(LastIdx));
+ for (unsigned Idx = LastIdx; Idx > RemovedIdx; --Idx) {
+ auto *PrevV = PHI.getIncomingValue(Idx - 1);
+ auto *PrevBB = PHI.getIncomingBlock(Idx - 1);
+ PHI.setIncomingValue(Idx, PrevV);
+ PHI.setIncomingBlock(Idx, PrevBB);
+ }
+ PHI.setIncomingValue(RemovedIdx, RemovedV);
+ PHI.setIncomingBlock(RemovedIdx, RemovedBB);
+}
+
+#ifndef NDEBUG
+void PHIRemoveIncoming::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
+PHIAddIncoming::PHIAddIncoming(PHINode &PHI, Tracker &Tracker)
+ : IRChangeBase(Tracker), PHI(PHI), Idx(PHI.getNumIncomingValues()) {}
+
+void PHIAddIncoming::revert() { PHI.removeIncomingValue(Idx); }
+
+#ifndef NDEBUG
+void PHIAddIncoming::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif // NDEBUG
+
Tracker::~Tracker() {
assert(Changes.empty() && "You must accept or revert changes!");
}
@@ -129,6 +204,37 @@ void RemoveFromParent::dump() const {
}
#endif
+CallBrInstSetDefaultDest::CallBrInstSetDefaultDest(CallBrInst *CallBr,
+ Tracker &Tracker)
+ : IRChangeBase(Tracker), CallBr(CallBr) {
+ OrigDefaultDest = CallBr->getDefaultDest();
+}
+void CallBrInstSetDefaultDest::revert() {
+ CallBr->setDefaultDest(OrigDefaultDest);
+}
+#ifndef NDEBUG
+void CallBrInstSetDefaultDest::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif
+
+CallBrInstSetIndirectDest::CallBrInstSetIndirectDest(CallBrInst *CallBr,
+ unsigned Idx,
+ Tracker &Tracker)
+ : IRChangeBase(Tracker), CallBr(CallBr), Idx(Idx) {
+ OrigIndirectDest = CallBr->getIndirectDest(Idx);
+}
+void CallBrInstSetIndirectDest::revert() {
+ CallBr->setIndirectDest(Idx, OrigIndirectDest);
+}
+#ifndef NDEBUG
+void CallBrInstSetIndirectDest::dump() const {
+ dump(dbgs());
+ dbgs() << "\n";
+}
+#endif
+
MoveInstr::MoveInstr(Instruction *MovedI, Tracker &Tracker)
: IRChangeBase(Tracker), MovedI(MovedI) {
if (auto *NextI = MovedI->getNextNode())
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 26b4f8e..7f68c5a 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -143,6 +143,7 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
+static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8};
static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
static constexpr fltSemantics semFloat6E3M2FN = {
4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
@@ -217,6 +218,8 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
return Float8E4M3FNUZ();
case S_Float8E4M3B11FNUZ:
return Float8E4M3B11FNUZ();
+ case S_Float8E3M4:
+ return Float8E3M4();
case S_FloatTF32:
return FloatTF32();
case S_Float6E3M2FN:
@@ -257,6 +260,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
return S_Float8E4M3FNUZ;
else if (&Sem == &llvm::APFloat::Float8E4M3B11FNUZ())
return S_Float8E4M3B11FNUZ;
+ else if (&Sem == &llvm::APFloat::Float8E3M4())
+ return S_Float8E3M4;
else if (&Sem == &llvm::APFloat::FloatTF32())
return S_FloatTF32;
else if (&Sem == &llvm::APFloat::Float6E3M2FN())
@@ -287,6 +292,7 @@ const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; }
const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
return semFloat8E4M3B11FNUZ;
}
+const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; }
const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
@@ -3643,6 +3649,11 @@ APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const {
return convertIEEEFloatToAPInt<semFloat8E4M3B11FNUZ>();
}
+APInt IEEEFloat::convertFloat8E3M4APFloatToAPInt() const {
+ assert(partCount() == 1);
+ return convertIEEEFloatToAPInt<semFloat8E3M4>();
+}
+
APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const {
assert(partCount() == 1);
return convertIEEEFloatToAPInt<semFloatTF32>();
@@ -3704,6 +3715,9 @@ APInt IEEEFloat::bitcastToAPInt() const {
if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ)
return convertFloat8E4M3B11FNUZAPFloatToAPInt();
+ if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4)
+ return convertFloat8E3M4APFloatToAPInt();
+
if (semantics == (const llvm::fltSemantics *)&semFloatTF32)
return convertFloatTF32APFloatToAPInt();
@@ -3932,6 +3946,10 @@ void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) {
initFromIEEEAPInt<semFloat8E4M3B11FNUZ>(api);
}
+void IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) {
+ initFromIEEEAPInt<semFloat8E3M4>(api);
+}
+
void IEEEFloat::initFromFloatTF32APInt(const APInt &api) {
initFromIEEEAPInt<semFloatTF32>(api);
}
@@ -3977,6 +3995,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
return initFromFloat8E4M3FNUZAPInt(api);
if (Sem == &semFloat8E4M3B11FNUZ)
return initFromFloat8E4M3B11FNUZAPInt(api);
+ if (Sem == &semFloat8E3M4)
+ return initFromFloat8E3M4APInt(api);
if (Sem == &semFloatTF32)
return initFromFloatTF32APInt(api);
if (Sem == &semFloat6E3M2FN)
diff --git a/llvm/lib/Support/Error.cpp b/llvm/lib/Support/Error.cpp
index 93481ca..baa3c32 100644
--- a/llvm/lib/Support/Error.cpp
+++ b/llvm/lib/Support/Error.cpp
@@ -182,6 +182,9 @@ LLVMErrorTypeId LLVMGetErrorTypeId(LLVMErrorRef Err) {
void LLVMConsumeError(LLVMErrorRef Err) { consumeError(unwrap(Err)); }
+
+void LLVMCantFail(LLVMErrorRef Err) { cantFail(unwrap(Err)); }
+
char *LLVMGetErrorMessage(LLVMErrorRef Err) {
std::string Tmp = toString(unwrap(Err));
char *ErrMsg = new char[Tmp.size() + 1];
diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index 34d294b..d525f5b 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -482,7 +482,8 @@ static RTL_OSVERSIONINFOEXW GetWindowsVer() {
HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
assert(hMod);
- auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
+ auto getVer =
+ (RtlGetVersionPtr)(void *)::GetProcAddress(hMod, "RtlGetVersion");
assert(getVer);
RTL_OSVERSIONINFOEXW info{};
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 29ebf7c..f11ad09f 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -171,23 +171,27 @@ static bool load64BitDebugHelp(void) {
HMODULE hLib =
::LoadLibraryExA("Dbghelp.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32);
if (hLib) {
- fMiniDumpWriteDump =
- (fpMiniDumpWriteDump)::GetProcAddress(hLib, "MiniDumpWriteDump");
- fStackWalk64 = (fpStackWalk64)::GetProcAddress(hLib, "StackWalk64");
- fSymGetModuleBase64 =
- (fpSymGetModuleBase64)::GetProcAddress(hLib, "SymGetModuleBase64");
- fSymGetSymFromAddr64 =
- (fpSymGetSymFromAddr64)::GetProcAddress(hLib, "SymGetSymFromAddr64");
- fSymGetLineFromAddr64 =
- (fpSymGetLineFromAddr64)::GetProcAddress(hLib, "SymGetLineFromAddr64");
- fSymGetModuleInfo64 =
- (fpSymGetModuleInfo64)::GetProcAddress(hLib, "SymGetModuleInfo64");
- fSymFunctionTableAccess64 = (fpSymFunctionTableAccess64)::GetProcAddress(
- hLib, "SymFunctionTableAccess64");
- fSymSetOptions = (fpSymSetOptions)::GetProcAddress(hLib, "SymSetOptions");
- fSymInitialize = (fpSymInitialize)::GetProcAddress(hLib, "SymInitialize");
- fEnumerateLoadedModules = (fpEnumerateLoadedModules)::GetProcAddress(
- hLib, "EnumerateLoadedModules64");
+ fMiniDumpWriteDump = (fpMiniDumpWriteDump)(void *)::GetProcAddress(
+ hLib, "MiniDumpWriteDump");
+ fStackWalk64 = (fpStackWalk64)(void *)::GetProcAddress(hLib, "StackWalk64");
+ fSymGetModuleBase64 = (fpSymGetModuleBase64)(void *)::GetProcAddress(
+ hLib, "SymGetModuleBase64");
+ fSymGetSymFromAddr64 = (fpSymGetSymFromAddr64)(void *)::GetProcAddress(
+ hLib, "SymGetSymFromAddr64");
+ fSymGetLineFromAddr64 = (fpSymGetLineFromAddr64)(void *)::GetProcAddress(
+ hLib, "SymGetLineFromAddr64");
+ fSymGetModuleInfo64 = (fpSymGetModuleInfo64)(void *)::GetProcAddress(
+ hLib, "SymGetModuleInfo64");
+ fSymFunctionTableAccess64 =
+ (fpSymFunctionTableAccess64)(void *)::GetProcAddress(
+ hLib, "SymFunctionTableAccess64");
+ fSymSetOptions =
+ (fpSymSetOptions)(void *)::GetProcAddress(hLib, "SymSetOptions");
+ fSymInitialize =
+ (fpSymInitialize)(void *)::GetProcAddress(hLib, "SymInitialize");
+ fEnumerateLoadedModules =
+ (fpEnumerateLoadedModules)(void *)::GetProcAddress(
+ hLib, "EnumerateLoadedModules64");
}
return isDebugHelpInitialized();
}
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 310b152..415edb1 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -833,6 +833,11 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
"EXP+" + MangledName.value())));
A->setAliasee(&F);
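+ // Move the dllexport storage class from the function to its "EXP+" alias.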
+ if (F.hasDLLExportStorageClass()) {
+ A->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
+ F.setDLLStorageClass(GlobalValue::DefaultStorageClass);
+ }
+
FnsMap[A] = GlobalAlias::create(GlobalValue::LinkOnceODRLinkage,
MangledName.value(), &F);
PatchableFns.insert(A);
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 3c9b07a..b51c056 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -160,8 +160,7 @@ public:
/// tblgen'erated driver function for lowering simple MI->MC
/// pseudo instructions.
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
+ bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
void emitInstruction(const MachineInstr *MI) override;
@@ -2316,8 +2315,10 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
AArch64_MC::verifyInstructionPredicates(MI->getOpcode(), STI->getFeatureBits());
// Do any auto-generated pseudo lowerings.
- if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+ EmitToStreamer(*OutStreamer, OutInst);
return;
+ }
if (MI->getOpcode() == AArch64::ADRP) {
for (auto &Opd : MI->operands()) {
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 49e5211..9669a39 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -711,7 +711,6 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
Head->updateTerminator(CmpBB->getNextNode());
RemovedBlocks.push_back(CmpBB);
- CmpBB->eraseFromParent();
LLVM_DEBUG(dbgs() << "Result:\n" << *Head);
++NumConverted;
}
@@ -918,6 +917,8 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
CmpConv.convert(RemovedBlocks);
Changed = true;
updateDomTree(RemovedBlocks);
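+ // Erase the removed blocks only after the dominator tree has been updated.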
+ for (MachineBasicBlock *MBB : RemovedBlocks)
+ MBB->eraseFromParent();
updateLoops(RemovedBlocks);
}
return Changed;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d86e52d..1e9da9b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4508,11 +4508,19 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
EVT SrcElementVT = SrcVT.getVectorElementType();
// In the absence of FP16 support, promote f16 to f32 and saturate the result.
+ SDLoc DL(Op);
+ SDValue SrcVal2;
if ((SrcElementVT == MVT::f16 &&
(!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
SrcElementVT == MVT::bf16) {
MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
- SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
+ SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
+ // If we are extending to a v8f32, split into two v4f32 to produce legal
+ // types.
+ if (F32VT.getSizeInBits() > 128) {
+ std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
+ F32VT = F32VT.getHalfNumVectorElementsVT();
+ }
SrcVT = F32VT;
SrcElementVT = MVT::f32;
SrcElementWidth = 32;
@@ -4520,9 +4528,8 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
return SDValue();
- SDLoc DL(Op);
- // Expand to f64 if we are saturating to i64, to help produce keep the lanes
- // the same width and produce a fcvtzu.
+ // Expand to f64 if we are saturating to i64, to help keep the lanes the same
+ // width and produce a fcvtzu.
if (SatWidth == 64 && SrcElementWidth < 64) {
MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
@@ -4531,9 +4538,16 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
SrcElementWidth = 64;
}
// Cases that we can emit directly.
- if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
- return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
- DAG.getValueType(DstVT.getScalarType()));
+ if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
+ SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
+ DAG.getValueType(DstVT.getScalarType()));
+ if (SrcVal2) {
+ SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
+ DAG.getValueType(DstVT.getScalarType()));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
+ }
+ return Res;
+ }
// Otherwise we emit a cvt that saturates to a higher BW, and saturate the
// result. This is only valid if the legal cvt is larger than the saturate
@@ -4545,20 +4559,32 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
DAG.getValueType(IntVT.getScalarType()));
- SDValue Sat;
+ SDValue NativeCvt2 =
+ SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
+ DAG.getValueType(IntVT.getScalarType()))
+ : SDValue();
+ SDValue Sat, Sat2;
if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
SDValue MinC = DAG.getConstant(
APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
+ SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
SDValue MaxC = DAG.getConstant(
APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
+ Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
} else {
SDValue MinC = DAG.getConstant(
APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
+ Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
}
+ if (SrcVal2)
+ Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
+ Sat, Sat2);
+
return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7de813f..79c0e45 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -748,22 +748,44 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// output are the same, or we are using cvt f64->i32 or f32->i64.
if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
- LT.second == MVT::v2f64) &&
- (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
- (LT.second == MVT::f64 && MTy == MVT::i32) ||
- (LT.second == MVT::f32 && MTy == MVT::i64)))
- return LT.first;
- // Similarly for fp16 sizes
- if (ST->hasFullFP16() &&
- ((LT.second == MVT::f16 && MTy == MVT::i32) ||
- ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
- (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
+ LT.second == MVT::v2f64)) {
+ if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
+ (LT.second == MVT::f64 && MTy == MVT::i32) ||
+ (LT.second == MVT::f32 && MTy == MVT::i64)))
+ return LT.first;
+ // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
+ if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
+ MTy.getScalarSizeInBits() == 64)
+ return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
+ }
+ // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
+ // f32.
+ if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
+ return LT.first + getIntrinsicInstrCost(
+ {ICA.getID(),
+ RetTy,
+ {ICA.getArgTypes()[0]->getWithNewType(
+ Type::getFloatTy(RetTy->getContext()))}},
+ CostKind);
+ if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
+ (LT.second == MVT::f16 && MTy == MVT::i64) ||
+ ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
+ (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
return LT.first;
-
- // Otherwise we use a legal convert followed by a min+max
+ // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
+ if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
+ MTy.getScalarSizeInBits() == 32)
+ return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
+ // Extending f16 vector types to 64-bit integer elements (e.g. v4f16->v4i64).
+ // These currently scalarize but the codegen could be better.
+ if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
+ MTy.getScalarSizeInBits() == 64)
+ return MTy.getVectorNumElements() * 3;
+
+ // If we can, use a legal convert followed by a min+max.
if ((LT.second.getScalarType() == MVT::f32 ||
LT.second.getScalarType() == MVT::f64 ||
- (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
+ LT.second.getScalarType() == MVT::f16) &&
LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
Type *LegalTy =
Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
@@ -776,9 +798,33 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
LegalTy, {LegalTy, LegalTy});
Cost += getIntrinsicInstrCost(Attrs2, CostKind);
- return LT.first * Cost;
+ return LT.first * Cost +
+ ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
+ : 1);
}
- break;
+ // Otherwise we need to follow the default expansion that clamps the value
+ // using a float min/max, with an fcmp+sel for NaN handling when signed.
+ Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
+ RetTy = RetTy->getScalarType();
+ if (LT.second.isVector()) {
+ FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
+ RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
+ }
+ IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
+ InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
+ IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
+ Cost += getIntrinsicInstrCost(Attrs2, CostKind);
+ Cost +=
+ getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
+ RetTy, FPTy, TTI::CastContextHint::None, CostKind);
+ if (IsSigned) {
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ }
+ return LT.first * Cost;
}
case Intrinsic::fshl:
case Intrinsic::fshr: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f70a60a..f66bbde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -109,8 +109,7 @@ public:
/// tblgen'erated driver function for lowering simple MI->MC pseudo
/// instructions.
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
+ bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
/// Implemented in AMDGPUMCInstLower.cpp
void emitInstruction(const MachineInstr *MI) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 537d3a4..825c1de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -398,8 +398,10 @@ def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm">,
def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
GISDNodeXFormEquiv<NegateImm>;
-def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
+def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastFPImm32">,
GISDNodeXFormEquiv<bitcast_fpimm_to_i32>;
+def gi_bitcast_fpimm_to_i64 : GICustomOperandRenderer<"renderBitcastFPImm64">,
+ GISDNodeXFormEquiv<bitcast_fpimm_to_i64>;
def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
GISDNodeXFormEquiv<IMMPopCount>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 63048c7..73f3921 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2169,27 +2169,6 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
return Ret;
}
-static int sizeToSubRegIndex(unsigned Size) {
- switch (Size) {
- case 32:
- return AMDGPU::sub0;
- case 64:
- return AMDGPU::sub0_sub1;
- case 96:
- return AMDGPU::sub0_sub1_sub2;
- case 128:
- return AMDGPU::sub0_sub1_sub2_sub3;
- case 256:
- return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
- default:
- if (Size < 32)
- return AMDGPU::sub0;
- if (Size > 256)
- return -1;
- return sizeToSubRegIndex(llvm::bit_ceil(Size));
- }
-}
-
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
@@ -2293,8 +2272,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
if (SrcSize > 32) {
- int SubRegIdx = sizeToSubRegIndex(DstSize);
- if (SubRegIdx == -1)
+ unsigned SubRegIdx =
+ DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
+ if (SubRegIdx == AMDGPU::NoSubRegister)
return false;
// Deal with weird cases where the class only partially supports the subreg
@@ -2523,89 +2503,6 @@ bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
return false;
}
-bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineOperand &ImmOp = I.getOperand(1);
- Register DstReg = I.getOperand(0).getReg();
- unsigned Size = MRI->getType(DstReg).getSizeInBits();
- bool IsFP = false;
-
- // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
- if (ImmOp.isFPImm()) {
- const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
- ImmOp.ChangeToImmediate(Imm.getZExtValue());
- IsFP = true;
- } else if (ImmOp.isCImm()) {
- ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
- } else {
- llvm_unreachable("Not supported by g_constants");
- }
-
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
- const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
-
- unsigned Opcode;
- if (DstRB->getID() == AMDGPU::VCCRegBankID) {
- Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- } else if (Size == 64 &&
- AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
- Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
- I.setDesc(TII.get(Opcode));
- I.addImplicitDefUseOperands(*MF);
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
- } else {
- Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
-
- // We should never produce s1 values on banks other than VCC. If the user of
- // this already constrained the register, we may incorrectly think it's VCC
- // if it wasn't originally.
- if (Size == 1)
- return false;
- }
-
- if (Size != 64) {
- I.setDesc(TII.get(Opcode));
- I.addImplicitDefUseOperands(*MF);
- return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
- }
-
- const DebugLoc &DL = I.getDebugLoc();
-
- APInt Imm(Size, I.getOperand(1).getImm());
-
- MachineInstr *ResInst;
- if (IsSgpr && TII.isInlineConstant(Imm)) {
- ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
- .addImm(I.getOperand(1).getImm());
- } else {
- const TargetRegisterClass *RC = IsSgpr ?
- &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
- Register LoReg = MRI->createVirtualRegister(RC);
- Register HiReg = MRI->createVirtualRegister(RC);
-
- BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
- .addImm(Imm.trunc(32).getZExtValue());
-
- BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
- .addImm(Imm.ashr(32).getZExtValue());
-
- ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(LoReg)
- .addImm(AMDGPU::sub0)
- .addReg(HiReg)
- .addImm(AMDGPU::sub1);
- }
-
- // We can't call constrainSelectedInstRegOperands here, because it doesn't
- // work for target independent opcodes
- I.eraseFromParent();
- const TargetRegisterClass *DstRC =
- TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
- if (!DstRC)
- return true;
- return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
-}
-
bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
// Only manually handle the f64 SGPR case.
//
@@ -3532,9 +3429,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_FREEZE:
return selectCOPY(I);
- case TargetOpcode::G_CONSTANT:
- case TargetOpcode::G_FCONSTANT:
- return selectG_CONSTANT(I);
case TargetOpcode::G_FNEG:
if (selectImpl(I, *CoverageInfo))
return true;
@@ -3640,6 +3534,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectStackRestore(I);
case AMDGPU::G_PHI:
return selectPHI(I);
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
default:
return selectImpl(I, *CoverageInfo);
}
@@ -5626,18 +5522,12 @@ void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}
-void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
- assert(OpIdx == -1);
-
+void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
const MachineOperand &Op = MI.getOperand(1);
- if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
- MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
- else {
- assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
- MIB.addImm(Op.getCImm()->getSExtValue());
- }
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
+ MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
}
void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 43ed210..7fff7d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -91,7 +91,6 @@ private:
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
bool selectG_FPEXT(MachineInstr &I) const;
- bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_FNEG(MachineInstr &I) const;
bool selectG_FABS(MachineInstr &I) const;
bool selectG_AND_OR_XOR(MachineInstr &I) const;
@@ -333,8 +332,17 @@ private:
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
- void renderBitcastImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
+ void renderBitcastFPImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
+ void renderBitcastFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderBitcastFPImm(MIB, MI, OpIdx);
+ }
+ void renderBitcastFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderBitcastFPImm(MIB, MI, OpIdx);
+ }
void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index e91d059..e724c97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -225,10 +225,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
: m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
for (User *ICmp : BlockCount->users()) {
- ICmpInst::Predicate Pred;
- if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
- if (Pred != ICmpInst::ICMP_ULT)
- continue;
+ if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
+ m_Specific(BlockCount)))) {
ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
MadeChange = true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 45ec38c..f5b5e9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -118,7 +118,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
- // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
+ // FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
// need to select it to the subtarget specific version, and there's no way to
// do that with a single pseudo source operation.
if (Opcode == AMDGPU::S_SETPC_B64_return)
@@ -187,8 +187,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
// getSubtargetInfo().getFeatureBits());
- if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+ EmitToStreamer(*OutStreamer, OutInst);
return;
+ }
const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 42a6bac..02b0d43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -437,7 +437,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
return false;
auto PrintfFunction = M.getFunction("printf");
- if (!PrintfFunction || !PrintfFunction->isDeclaration())
+ if (!PrintfFunction || !PrintfFunction->isDeclaration() ||
+ M.getModuleFlag("openmp"))
return false;
for (auto &U : PrintfFunction->uses()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 146649a..0aca99a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -86,13 +86,8 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
}
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
- const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
- return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
-}
-
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
const GCNSubtarget &ST) const {
- return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
+ return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR);
}
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 16dcc28..7f71de6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -45,8 +45,6 @@ public:
int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
// Total number of VGPRs is actually a combination of AGPR and VGPR
// depending on architecture - and some alignment constraints
- int32_t getTotalNumVGPRs(const GCNSubtarget &ST, int32_t NumAGPR,
- int32_t NumVGPR) const;
int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0b1ecc0..5191fb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -418,19 +418,19 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
- unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
+ unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicElementSize) const {
if (AtomicElementSize)
return Type::getIntNTy(Context, *AtomicElementSize * 8);
- unsigned MinAlign = std::min(SrcAlign, DestAlign);
+ Align MinAlign = std::min(SrcAlign, DestAlign);
// A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
// hardware into byte accesses. If you assume all alignments are equally
// probable, it's more efficient on average to use short accesses for this
// case.
- if (MinAlign == 2)
+ if (MinAlign == Align(2))
return Type::getInt16Ty(Context);
// Not all subtargets have 128-bit DS instructions, and we currently don't
@@ -450,7 +450,7 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize) const {
assert(RemainingBytes < 16);
@@ -459,9 +459,9 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
DestAlign, AtomicCpySize);
- unsigned MinAlign = std::min(SrcAlign, DestAlign);
+ Align MinAlign = std::min(SrcAlign, DestAlign);
- if (MinAlign != 2) {
+ if (MinAlign != Align(2)) {
Type *I64Ty = Type::getInt64Ty(Context);
while (RemainingBytes >= 8) {
OpsOut.push_back(I64Ty);
@@ -686,7 +686,9 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
switch (ID) {
- case Intrinsic::fma: // TODO: fmuladd
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
+ case Intrinsic::copysign:
// There's a small benefit to using vector ops in the legalized code.
case Intrinsic::round:
case Intrinsic::uadd_sat:
@@ -730,9 +732,16 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
switch (ICA.getID()) {
case Intrinsic::fma:
- InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
- : getQuarterRateInstrCost(CostKind);
+ case Intrinsic::fmuladd:
+ if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
+ InstRate = getFullRateInstrCost();
+ else {
+ InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
+ : getQuarterRateInstrCost(CostKind);
+ }
break;
+ case Intrinsic::copysign:
+ return NElts * getFullRateInstrCost();
case Intrinsic::uadd_sat:
case Intrinsic::usub_sat:
case Intrinsic::sadd_sat:
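A minimal sketch of the rate selection introduced in the hunk above, using hypothetical helper and parameter names (this is not part of GCNTTIImpl): packed fma/fmuladd is priced at the full rate when the subtarget has fast f32 FMA or the element type is f16, and copysign at NElts times the full rate.

// Hypothetical helper mirroring the cost-rate choice above; not the TTI API.
static unsigned fmaLikeRate(bool HasFastFMAF32, bool IsF32, bool IsF16,
                            unsigned FullRate, unsigned HalfRate,
                            unsigned QuarterRate) {
  // f32 with fast FMA, or any f16, runs at the full rate; otherwise fall
  // back to the half/quarter split keyed on fast-FMA support.
  if ((IsF32 && HasFastFMAF32) || IsF16)
    return FullRate;
  return HasFastFMAF32 ? HalfRate : QuarterRate;
}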
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df1..01df2e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -137,15 +137,16 @@ public:
unsigned AddrSpace) const;
int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
- Type *getMemcpyLoopLoweringType(
- LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
- unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
- std::optional<uint32_t> AtomicElementSize) const;
+ Type *
+ getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+ unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ Align SrcAlign, Align DestAlign,
+ std::optional<uint32_t> AtomicElementSize) const;
void getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize) const;
unsigned getMaxInterleaveFactor(ElementCount VF);
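For reference, a minimal sketch of the alignment check after the unsigned-to-Align migration above; prefersShortAccess is a hypothetical name, while llvm::Align, its comparisons, and std::min over Align are the interfaces the hunks actually use.

#include "llvm/Support/Alignment.h"
#include <algorithm>

// Hypothetical helper mirroring the test in getMemcpyLoopLoweringType: with
// llvm::Align the old "== 2" comparison must be written against Align(2).
static bool prefersShortAccess(llvm::Align SrcAlign, llvm::Align DestAlign) {
  llvm::Align MinAlign = std::min(SrcAlign, DestAlign);
  return MinAlign == llvm::Align(2);
}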
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index c31f85d..94bf5e4 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -119,20 +119,6 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- if (int64_t Offset = MI->getOperand(OpNo).getImm())
- O << " offset0:" << formatDec(Offset);
-}
-
-void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- if (int64_t Offset = MI->getOperand(OpNo).getImm())
- O << " offset1:" << formatDec(Offset);
-}
-
void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -145,13 +131,6 @@ void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo,
O << formatHex(MI->getOperand(OpNo).getImm());
}
-void AMDGPUInstPrinter::printSMEMOffsetMod(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O << " offset:";
- printSMEMOffset(MI, OpNo, STI, O);
-}
-
void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -269,14 +248,6 @@ void AMDGPUInstPrinter::printScope(int64_t Scope, raw_ostream &O) {
return;
}
-void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- if (MI->getOperand(OpNo).getImm()) {
- O << " dmask:";
- printU16ImmOperand(MI, OpNo, STI, O);
- }
-}
-
void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
unsigned Dim = MI->getOperand(OpNo).getImm();
@@ -352,8 +323,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::SP_REG:
case AMDGPU::PRIVATE_RSRC_REG:
llvm_unreachable("pseudo-register should not ever be emitted");
- case AMDGPU::SCC:
- llvm_unreachable("pseudo scc should not ever be emitted");
default:
break;
}
@@ -678,26 +647,6 @@ void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo,
O << " blgp:" << Imm;
}
-void AMDGPUInstPrinter::printCBSZ(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- unsigned Imm = MI->getOperand(OpNo).getImm();
- if (!Imm)
- return;
-
- O << " cbsz:" << Imm;
-}
-
-void AMDGPUInstPrinter::printABID(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- unsigned Imm = MI->getOperand(OpNo).getImm();
- if (!Imm)
- return;
-
- O << " abid:" << Imm;
-}
-
void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -711,30 +660,6 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
O << ", ";
}
-void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O << " wait_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
-}
-
-void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O << " wait_va_vdst:" << formatDec(MI->getOperand(OpNo).getImm());
-}
-
-void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O << " wait_vm_vsrc:" << formatDec(MI->getOperand(OpNo).getImm());
-}
-
-void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O << " wait_exp:" << formatDec(MI->getOperand(OpNo).getImm());
-}
-
bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
unsigned OpNo) const {
return OpNo == 0 && (Desc.TSFlags & SIInstrFlags::DPP) &&
@@ -1127,18 +1052,6 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDppRowMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O << " row_mask:" << formatHex(MI->getOperand(OpNo).getImm());
-}
-
-void AMDGPUInstPrinter::printDppBankMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- O << " bank_mask:" << formatHex(MI->getOperand(OpNo).getImm());
-}
-
void AMDGPUInstPrinter::printDppBoundCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1782,14 +1695,13 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
O << ' ' << formatDec(Imm);
}
-void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- uint8_t Imm = MI->getOperand(OpNo).getImm();
- if (!Imm)
- return;
-
- O << " byte_sel:" << formatDec(Imm);
+void AMDGPUInstPrinter::printNamedInt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, StringRef Prefix,
+ bool PrintInHex, bool AlwaysPrint) {
+ int64_t V = MI->getOperand(OpNo).getImm();
+ if (AlwaysPrint || V != 0)
+ O << ' ' << Prefix << ':' << (PrintInHex ? formatHex(V) : formatDec(V));
}
#include "AMDGPUGenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 4a39022..4d44db5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -48,24 +48,16 @@ private:
void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printSMEMOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printSMEMOffsetMod(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printCPol(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printTH(const MCInst *MI, int64_t TH, int64_t Scope, raw_ostream &O);
void printScope(int64_t Scope, raw_ostream &O);
- void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -110,10 +102,6 @@ private:
raw_ostream &O);
void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printDppRowMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printDppBankMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
void printDppBoundCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printDppFI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -154,21 +142,9 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printCBSZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printABID(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
bool needsImpliedVcc(const MCInstrDesc &Desc, unsigned OpNo) const;
void printDefaultVccOperand(bool FirstOperand, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printWaitVDST(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printWaitVAVDst(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printWaitVMVSrc(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O, unsigned N);
@@ -182,8 +158,9 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpTgt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ void printNamedInt(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O,
+ StringRef Prefix, bool PrintInHex, bool AlwaysPrint);
public:
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index db610a4..8c78db8 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -19,8 +19,8 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 00a8b15..b97256b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -828,7 +828,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{
class VGPRImm <dag frag> : PatLeaf<frag, [{
return isVGPRImm(N);
-}]>;
+}]> {
+ let GISelPredicateCode = [{return true;}];
+}
def NegateImm : SDNodeXForm<imm, [{
return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
@@ -1015,18 +1017,29 @@ def SDWAVopcDst : BoolRC {
let PrintMethod = "printVOPDst";
}
-class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
+class NamedIntOperand<ValueType Type, string prefix, bit Optional = 1,
string name = NAME>
: CustomOperand<Type, Optional, name> {
+ string Prefix = prefix;
+
let PredicateMethod =
"getPredicate([](const AMDGPUOperand &Op) -> bool { "#
"return Op.isImmTy(AMDGPUOperand::"#ImmTy#"); })";
+
string Validator = "[](int64_t V) { return true; }";
string ConvertMethod = "[](int64_t &V) { return "#Validator#"(V); }";
let ParserMethod =
"[this](OperandVector &Operands) -> ParseStatus { "#
"return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
"AMDGPUOperand::"#ImmTy#", "#ConvertMethod#"); }";
+
+ bit PrintInHex = 0;
+ bit AlwaysPrint = 0;
+ let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, "
+ "const MCSubtargetInfo &STI, raw_ostream &O) { "
+ "printNamedInt(MI, OpNo, STI, O, \""#Prefix#"\", "#
+ !if(PrintInHex, "true", "false")#", "#
+ !if(AlwaysPrint, "true", "false")#"); }";
}
class NamedBitOperand<string Id, string Name = NAME>
@@ -1065,6 +1078,7 @@ class ArrayOperand0<string Id, string Name = NAME>
let ImmTy = "ImmTyOffset" in
def flat_offset : CustomOperand<i32, 1, "FlatOffset">;
+let PrintMethod = "printOffset" in
def Offset : NamedIntOperand<i32, "offset">;
let Validator = "isUInt<8>" in {
def Offset0 : NamedIntOperand<i32, "offset0">;
@@ -1103,6 +1117,7 @@ def exp_vm : NamedBitOperand<"vm", "ExpVM">;
def FORMAT : CustomOperand<i8>;
+let PrintInHex = 1 in
def DMask : NamedIntOperand<i16, "dmask">;
def Dim : CustomOperand<i8, /*optional=*/1>;
@@ -1123,16 +1138,18 @@ def IndexKey8bit : CustomOperand<i32, 1>;
def dpp8 : CustomOperand<i32, 0, "DPP8">;
def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">;
-let DefaultValue = "0xf" in {
+let DefaultValue = "0xf", PrintInHex = 1, AlwaysPrint = 1 in {
def DppRowMask : NamedIntOperand<i32, "row_mask">;
def DppBankMask : NamedIntOperand<i32, "bank_mask">;
}
def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl"> {
let ConvertMethod = "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }";
+ let PrintMethod = "printDppBoundCtrl";
}
-let DecoderMethod = "decodeDpp8FI" in
+let DecoderMethod = "decodeDpp8FI", PrintMethod = "printDppFI" in
def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
+let PrintMethod = "printDppFI" in
def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
def blgp : CustomOperand<i32, 1, "BLGP">;
@@ -1146,6 +1163,7 @@ def hwreg : CustomOperand<i32, 0, "Hwreg">;
def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;
+let AlwaysPrint = 1 in {
def WaitVDST : NamedIntOperand<i8, "wait_vdst"> {
let Validator = "isUInt<4>";
}
@@ -1158,6 +1176,7 @@ def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst"> {
def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc"> {
let Validator = "isUInt<1>";
}
+} // End AlwaysPrint = 1
def ByteSel : NamedIntOperand<i8, "byte_sel"> {
let Validator = "isUInt<2>";
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 15078bc..c41850a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2138,19 +2138,28 @@ def : GCNPat <
/********** Immediate Patterns **********/
/********** ================== **********/
-def : GCNPat <
- (VGPRImm<(i32 imm)>:$imm),
- (V_MOV_B32_e32 imm:$imm)
->;
+// FIXME: Remove VGPRImm. Should be inferrable from register bank.
+
+foreach vt = [i32, p3, p5, p6, p2] in {
+ def : GCNPat <
+ (VGPRImm<(vt imm)>:$imm),
+ (V_MOV_B32_e32 imm:$imm)
+ >;
+
+ def : GCNPat <
+ (vt imm:$imm),
+ (S_MOV_B32 imm:$imm)
+ >;
+}
def : GCNPat <
- (VGPRImm<(f32 fpimm)>:$imm),
- (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
+ (p5 frameindex:$fi),
+ (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
>;
def : GCNPat <
- (i32 imm:$imm),
- (S_MOV_B32 imm:$imm)
+ (p5 frameindex:$fi),
+ (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
>;
def : GCNPat <
@@ -2163,18 +2172,39 @@ def : GCNPat <
(S_MOV_B32 $ga)
>;
-// FIXME: Workaround for ordering issue with peephole optimizer where
-// a register class copy interferes with immediate folding. Should
-// use s_mov_b32, which can be shrunk to s_movk_i32
-def : GCNPat <
- (VGPRImm<(f16 fpimm)>:$imm),
- (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
->;
+foreach pred = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in {
+ let True16Predicate = pred in {
+ def : GCNPat <
+ (VGPRImm<(i16 imm)>:$imm),
+ (V_MOV_B32_e32 imm:$imm)
+ >;
+ }
-def : GCNPat <
- (VGPRImm<(bf16 fpimm)>:$imm),
- (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
->;
+ // FIXME: Workaround for ordering issue with peephole optimizer where
+ // a register class copy interferes with immediate folding. Should
+ // use s_mov_b32, which can be shrunk to s_movk_i32
+
+ foreach vt = [f16, bf16] in {
+ def : GCNPat <
+      (VGPRImm<(vt fpimm)>:$imm),
+ (V_MOV_B32_e32 (vt (bitcast_fpimm_to_i32 $imm)))
+ >;
+ }
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+ def : GCNPat <
+ (VGPRImm<(i16 imm)>:$imm),
+ (V_MOV_B16_t16_e64 0, imm:$imm, 0)
+ >;
+
+ foreach vt = [f16, bf16] in {
+ def : GCNPat <
+ (VGPRImm<(vt fpimm)>:$imm),
+ (V_MOV_B16_t16_e64 0, $imm, 0)
+ >;
+ }
+}
// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
// immediate and will be expanded as needed, but we will only use these patterns
@@ -2210,47 +2240,71 @@ def : GCNPat <
>;
def : GCNPat <
+ (VGPRImm<(bf16 fpimm)>:$imm),
+ (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : GCNPat <
(bf16 fpimm:$imm),
(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;
def : GCNPat <
- (p5 frameindex:$fi),
- (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
+ (VGPRImm<(f32 fpimm)>:$imm),
+ (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : GCNPat <
- (p5 frameindex:$fi),
- (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
+ (f32 fpimm:$imm),
+ (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
+foreach vt = [i64, p1, p0, p4] in { // FIXME: Should accept arbitrary addrspace
+ def : GCNPat <
+ (VGPRImm<(vt imm)>:$imm),
+ (V_MOV_B64_PSEUDO imm:$imm)
+ >;
+
+ def : GCNPat <
+ (vt InlineImm64:$imm),
+ (S_MOV_B64 InlineImm64:$imm)
+ >;
+
+ def : GCNPat <
+ (vt imm:$imm),
+ (S_MOV_B64_IMM_PSEUDO imm:$imm)
+ >;
+}
+
def : GCNPat <
- (i64 InlineImm64:$imm),
- (S_MOV_B64 InlineImm64:$imm)
+ (VGPRImm<(f64 fpimm)>:$imm),
+ (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
>;
-// XXX - Should this use a s_cmp to set SCC?
+// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
+// immediate and will be expanded as needed, but we will only use these patterns
+// for values which can be encoded.
+def : GCNPat <
+ (f64 InlineImmFP64:$imm),
+ (S_MOV_B64 (i64 (bitcast_fpimm_to_i64 $imm)))
+>;
-// Set to sign-extended 64-bit value (true = -1, false = 0)
def : GCNPat <
- (i1 imm:$imm),
- (S_MOV_B64 (i64 (as_i64imm $imm)))
-> {
+ (f64 fpimm:$imm),
+ (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
+>;
+
+// Set to sign-extended 64-bit value (true = -1, false = 0)
+def : GCNPat <(i1 imm:$imm),
+ (S_MOV_B64 imm:$imm)> {
let WaveSizePredicate = isWave64;
}
-def : GCNPat <
- (i1 imm:$imm),
- (S_MOV_B32 (i32 (as_i32imm $imm)))
-> {
+def : GCNPat <(i1 imm:$imm),
+ (S_MOV_B32 imm:$imm)> {
let WaveSizePredicate = isWave32;
}
-def : GCNPat <
- (f64 InlineImmFP64:$imm),
- (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm)))
->;
-
/********** ================== **********/
/********** Intrinsic Patterns **********/
/********** ================== **********/
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8a315aa..e824e95 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2449,7 +2449,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
? &AMDGPU::SReg_32RegClass
: &AMDGPU::VGPR_32RegClass;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+ MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+ MI->getOpcode() == AMDGPU::S_MOV_B32;
Register ResultReg =
IsCopy ? MI->getOperand(0).getReg()
: RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2458,7 +2459,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (Offset == 0) {
unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
: AMDGPU::V_LSHRREV_B32_e64;
- auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+ Register TmpResultReg = ResultReg;
+ if (IsSALU && LiveSCC) {
+ TmpResultReg = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, false, 0);
+ }
+
+ auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
// For V_LSHRREV, the operands are reversed (the shift count goes
// first).
@@ -2468,11 +2475,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
- Register NewDest = RS->scavengeRegisterBackwards(
- AMDGPU::SReg_32RegClass, Shift, false, 0);
+ Register NewDest =
+ IsCopy ? ResultReg
+ : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+ Shift, false, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
- .addReg(ResultReg);
+ .addReg(TmpResultReg);
ResultReg = NewDest;
}
} else {
@@ -2523,22 +2532,82 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// We may have 1 free scratch SGPR even though a carry out is
// unavailable. Only one additional mov is needed.
- Register TmpScaledReg = RS->scavengeRegisterBackwards(
- AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
- Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+ Register TmpScaledReg = IsCopy && IsSALU
+ ? ResultReg
+ : RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_32_XM0RegClass, MI,
+ false, 0, /*AllowSpill=*/false);
+ Register ScaledReg =
+ TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+ Register TmpResultReg = ScaledReg;
+
+ if (!LiveSCC) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+ .addReg(FrameReg)
+ .addImm(ST.getWavefrontSizeLog2());
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+ .addReg(TmpResultReg, RegState::Kill)
+ .addImm(Offset);
+ } else {
+ TmpResultReg = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+ MachineInstrBuilder Add;
+ if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
+ BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
+ BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
+ ResultReg)
+ .addImm(Offset);
+ Add.addReg(ResultReg, RegState::Kill)
+ .addReg(TmpResultReg, RegState::Kill)
+ .addImm(0);
+ } else
+ Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ TmpResultReg)
+ .addImm(Offset);
+ assert(Offset > 0 &&
+ isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
+ "offset is unsafe for v_mad_u32_u24");
+              // We start with a frame pointer holding a wave-space value and
+              // an offset in lane space, and we are materializing a
+              // lane-space value. We can either right shift the frame pointer
+              // to get to lane space, or left shift the offset to get to wave
+              // space and right shift after the computation to get back to
+              // the desired per-lane value. We use v_mad_u32_u24 primarily as
+              // an add with no carry-out clobber.
+ Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+ TmpResultReg)
+ .addReg(TmpResultReg, RegState::Kill)
+ .addImm(ST.getWavefrontSize())
+ .addReg(FrameReg)
+ .addImm(0);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ }
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
- .addReg(FrameReg)
- .addImm(ST.getWavefrontSizeLog2());
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(Offset);
+ Register NewDest = IsCopy ? ResultReg
+ : RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_32RegClass, *Add,
+ false, 0, /*AllowSpill=*/true);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ NewDest)
+ .addReg(TmpResultReg);
+ ResultReg = NewDest;
+ }
if (!IsSALU)
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(TmpResultReg, RegState::Kill);
else
- ResultReg = ScaledReg;
-
+ ResultReg = TmpResultReg;
// If there were truly no free SGPRs, we need to undo everything.
if (!TmpScaledReg.isValid()) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
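The comment in the hunk above argues that scaling the offset up into wave space, adding the frame value, and shifting back down yields the same per-lane value as shifting the frame value first and then adding; here is a standalone arithmetic check with hypothetical constants (not the codegen path itself).

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t WaveSize = 64, WaveSizeLog2 = 6; // wave64
  const uint32_t FrameVal = 4096; // hypothetical wave-space frame value
  const uint32_t Offset = 40;     // hypothetical lane-space offset
  // v_mad_u32_u24 ordering: offset * wave size + frame value, then shift down.
  uint32_t ViaMad = (Offset * WaveSize + FrameVal) >> WaveSizeLog2;
  // s_lshr + s_add ordering used when SCC is free.
  uint32_t ViaShiftAdd = (FrameVal >> WaveSizeLog2) + Offset;
  assert(ViaMad == ViaShiftAdd);
  (void)ViaMad;
  (void)ViaShiftAdd;
  return 0;
}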
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4218b7d..8cc963a 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -11,7 +11,10 @@ def smrd_offset_8 : ImmOperand<i32, "SMRDOffset8", 1>;
let EncoderMethod = "getSMEMOffsetEncoding",
DecoderMethod = "decodeSMEMOffset" in {
def SMEMOffset : ImmOperand<i32, "SMEMOffset", 1>;
-def SMEMOffsetMod : NamedIntOperand<i32, "offset", 0>;
+def SMEMOffsetMod : NamedIntOperand<i32, "offset", 0> {
+ let AlwaysPrint = 1;
+ let PrintInHex = 1;
+}
def OptSMEMOffsetMod : NamedIntOperand<i32, "offset"> {
let ImmTy = SMEMOffsetMod.ImmTy;
let PredicateMethod = SMEMOffsetMod.PredicateMethod;
@@ -950,7 +953,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
(vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
(!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
(extract_cpol $cachepolicy))> {
- let OtherPredicates = [isGFX7Only];
+ let SubtargetPredicate = isGFX7Only;
let AddedComplexity = 1;
}
@@ -958,12 +961,12 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isNotGFX9Plus];
+ let SubtargetPredicate = isNotGFX9Plus;
}
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX9Plus];
+ let SubtargetPredicate = isGFX9Plus;
}
// 4. Offset as an 32-bit SGPR + immediate
@@ -972,7 +975,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
timm:$cachepolicy),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
(extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX9Plus];
+ let SubtargetPredicate = isGFX9Plus;
}
}
@@ -981,28 +984,28 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
def : GCNPat <
(node (SMRDImm i64:$sbase, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 2. SGPR offset
def : GCNPat <
(node (SMRDSgpr i64:$sbase, i32:$soffset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 3. SGPR+IMM offset
def : GCNPat <
(node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 4. No offset
def : GCNPat <
(vt (node (i64 SReg_64:$sbase))),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))>{
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
}
@@ -1012,14 +1015,14 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
def : GCNPat <
(name v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 2. Offset as an 32-bit SGPR
def : GCNPat <
(name v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
// 3. Offset as an 32-bit SGPR + immediate
@@ -1028,7 +1031,7 @@ multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
timm:$cachepolicy),
(i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
(extract_cpol $cachepolicy)))> {
- let OtherPredicates = [isGFX12Plus];
+ let SubtargetPredicate = isGFX12Plus;
}
}
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 642739a..d9a8789 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1447,8 +1447,10 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitUnwindingInstruction(MI);
// Do any auto-generated pseudo lowerings.
- if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+ EmitToStreamer(*OutStreamer, OutInst);
return;
+ }
assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
"Pseudo flag setting opcode should be expanded early");
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h
index 33b4417..c4503d9 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -126,9 +126,8 @@ private:
void EmitUnwindingInstruction(const MachineInstr *MI);
- // emitPseudoExpansionLowering - tblgen'erated.
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
+ // tblgen'erated.
+ bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
public:
unsigned getISAEncoding() override {
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index 2c28532..c6d4aa9 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -218,7 +218,7 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
// Emit "B #20" instruction, which jumps over the next 24 bytes (because
// register pc is 8 bytes ahead of the jump instruction by the moment CPU
// is executing it).
- // By analogy to ARMAsmPrinter::emitPseudoExpansionLowering() |case ARM::B|.
+ // By analogy to ARMAsmPrinter::lowerPseudoInstExpansion() |case ARM::B|.
// It is not clear why |addReg(0)| is needed (the last operand).
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc).addImm(20)
.addImm(ARMCC::AL).addReg(0));
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index b5ca045..e940dce 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1963,7 +1963,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
return LT.first * ST->getMVEVectorCostFactor(CostKind);
- // Otherwise we use a legal convert followed by a min+max
+    // If we can, we use a legal convert followed by a min+max
if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
(ST->hasFP64() && LT.second == MVT::f64) ||
(ST->hasFullFP16() && LT.second == MVT::f16) ||
@@ -1984,7 +1984,25 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Cost += getIntrinsicInstrCost(Attrs2, CostKind);
return LT.first * Cost;
}
- break;
+ // Otherwise we need to follow the default expansion that clamps the value
+    // using a float min/max with an fcmp+sel for NaN handling when signed.
+ Type *FPTy = ICA.getArgTypes()[0];
+ Type *RetTy = ICA.getReturnType();
+ IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
+ InstructionCost Cost = getIntrinsicInstrCost(Attrs1, CostKind);
+ IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
+ Cost += getIntrinsicInstrCost(Attrs2, CostKind);
+ Cost +=
+ getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
+ RetTy, FPTy, TTI::CastContextHint::None, CostKind);
+ if (IsSigned) {
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ CmpInst::FCMP_UNO, CostKind);
+ }
+ return Cost;
}
}
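The new fallback above accounts for the default saturating float-to-int expansion as a min/max clamp plus the conversion, with an extra fcmp/select pair to zero out NaN inputs in the signed case; a minimal sketch of that accounting with hypothetical per-piece costs (this is not the TTI interface):

#include <cstdint>

// Hypothetical stand-ins for the getIntrinsicInstrCost / getCastInstrCost /
// getCmpSelInstrCost results summed in the hunk above.
static uint64_t fpToIntSatFallbackCost(bool IsSigned, uint64_t MinNumCost,
                                       uint64_t MaxNumCost, uint64_t CastCost,
                                       uint64_t FCmpCost, uint64_t SelectCost) {
  uint64_t Cost = MinNumCost + MaxNumCost + CastCost; // clamp + convert
  if (IsSigned)
    Cost += FCmpCost + SelectCost; // fcmp uno + select for NaN handling
  return Cost;
}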
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index de343a7..9df752f 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -27,6 +27,7 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index c442794..5a1ae90 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -61,8 +61,8 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx,
switch (FixupKind) {
default: {
- const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
- report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+ Ctx.reportError(Fixup.getLoc(), "unsupported relocation type");
+ return COFF::IMAGE_REL_ARM_ABSOLUTE;
}
case FK_Data_4:
switch (Modifier) {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
index b49fa38..e32d2d4 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -1,13 +1,12 @@
#include "AVRELFStreamer.h"
-
+#include "AVRMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/TargetParser/SubtargetFeature.h"
-#include "AVRMCTargetDesc.h"
-
namespace llvm {
static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
index 7d121b8..7f0c2d1 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
@@ -145,8 +145,10 @@ void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) {
getSubtargetInfo().getFeatureBits());
// Do any auto-generated pseudo lowerings.
- if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+ EmitToStreamer(*OutStreamer, OutInst);
return;
+ }
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != CSKY::CONSTPOOL_ENTRY) {
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
index 3791895..da47b65 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
@@ -41,8 +41,7 @@ public:
/// tblgen'erated driver function for lowering simple MI->MC
/// pseudo instructions.
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
+ bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
index a77e3be..346b123 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFStreamer.cpp
@@ -17,6 +17,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index adaaa2a..a66f5b6 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -13,14 +13,32 @@
include "llvm/IR/Intrinsics.td"
-class DXILOpClass;
+// Abstract class to represent major and minor version values
+class Version<int major, int minor> {
+ int Major = major;
+ int Minor = minor;
+}
+
+// Valid DXIL Version records
+foreach i = 0...8 in {
+ def DXIL1_ #i : Version<1, i>;
+}
-// Following is a set of DXIL Operation classes whose names appear to be
-// arbitrary, yet need to be a substring of the function name used during
-// lowering to DXIL Operation calls. These class name strings are specified
-// as the third argument of add_dixil_op in utils/hct/hctdb.py and case converted
-// in utils/hct/hctdb_instrhelp.py of DirectXShaderCompiler repo. The function
-// name has the format "dx.op.<class-name>.<return-type>".
+// Overload type alias of llvm_any_ty
+defvar overloadTy = llvm_any_ty;
+
+// Type aliases for DXIL Op types to LLVM Types.
+// TODO: Define DXIL Types independent of LLVM types
+defvar i1Ty = llvm_i1_ty;
+defvar i8Ty = llvm_i8_ty;
+defvar i16Ty = llvm_i16_ty;
+defvar i32Ty = llvm_i32_ty;
+defvar i64Ty = llvm_i64_ty;
+defvar halfTy = llvm_half_ty;
+defvar floatTy = llvm_float_ty;
+defvar doubleTy = llvm_double_ty;
+
+class DXILOpClass;
defset list<DXILOpClass> OpClasses = {
def acceptHitAndEndSearch : DXILOpClass;
@@ -206,160 +224,482 @@ defset list<DXILOpClass> OpClasses = {
def writeSamplerFeedbackGrad : DXILOpClass;
def writeSamplerFeedbackLevel: DXILOpClass;
- // This is a sentinel definition. Hence placed at the end of the list
- // and not as part of the above alphabetically sorted valid definitions.
+ // This is a sentinel definition. Hence placed at the end here and
+ // not as part of the above alphabetically sorted valid definitions.
+  // It is never used to construct the name of a DXIL Op call.
// Additionally it is capitalized unlike all the others.
- def UnknownOpClass: DXILOpClass;
+ def UnknownOpClass : DXILOpClass;
}
-// Several of the overloaded DXIL Operations support for data types
-// that are a subset of the overloaded LLVM intrinsics that they map to.
-// For e.g., llvm.sin.* intrinsic operates on any floating-point type and
-// maps for lowering to DXIL Op Sin. However, valid overloads of DXIL Sin
-// operation overloads are half (f16) and float (f32) only.
-//
-// The following abstracts overload types specific to DXIL operations.
-
-class DXILType : LLVMType<OtherVT> {
- let isAny = 1;
- int isI16OrI32 = 0;
- int isHalfOrFloat = 0;
-}
-
-// Concrete records for various overload types supported specifically by
-// DXIL Operations.
-let isI16OrI32 = 1 in
- def llvm_i16ori32_ty : DXILType;
-
-let isHalfOrFloat = 1 in
- def llvm_halforfloat_ty : DXILType;
-
-// Abstraction DXIL Operation to LLVM intrinsic
-class DXILOpMappingBase {
- int OpCode = 0; // Opcode of DXIL Operation
- DXILOpClass OpClass = UnknownOpClass;// Class of DXIL Operation.
- Intrinsic LLVMIntrinsic = ?; // LLVM Intrinsic DXIL Operation maps to
- string Doc = ""; // A short description of the operation
- list<LLVMType> OpTypes = ?; // Valid types of DXIL Operation in the
- // format [returnTy, param1ty, ...]
-}
-
-class DXILOpMapping<int opCode, DXILOpClass opClass,
- Intrinsic intrinsic, string doc,
- list<LLVMType> opTys = []> : DXILOpMappingBase {
- int OpCode = opCode; // Opcode corresponding to DXIL Operation
- DXILOpClass OpClass = opClass; // Class of DXIL Operation.
- Intrinsic LLVMIntrinsic = intrinsic; // LLVM Intrinsic the DXIL Operation maps
- string Doc = doc; // to a short description of the operation
- list<LLVMType> OpTypes = !if(!eq(!size(opTys), 0), LLVMIntrinsic.Types, opTys);
-}
-
-// Concrete definition of DXIL Operation mapping to corresponding LLVM intrinsic
-def Abs : DXILOpMapping<6, unary, int_fabs,
- "Returns the absolute value of the input.">;
-def IsInf : DXILOpMapping<9, isSpecialFloat, int_dx_isinf,
- "Determines if the specified value is infinite.",
- [llvm_i1_ty, llvm_halforfloat_ty]>;
-def Cos : DXILOpMapping<12, unary, int_cos,
- "Returns cosine(theta) for theta in radians.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Sin : DXILOpMapping<13, unary, int_sin,
- "Returns sine(theta) for theta in radians.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Tan : DXILOpMapping<14, unary, int_tan,
- "Returns tangent(theta) for theta in radians.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def ACos : DXILOpMapping<15, unary, int_acos,
- "Returns the arccosine of each component of input.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def ASin : DXILOpMapping<16, unary, int_asin,
- "Returns the arcsine of each component of input.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def ATan : DXILOpMapping<17, unary, int_atan,
- "Returns the arctangent of each component of input.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def HCos : DXILOpMapping<18, unary, int_cosh,
- "Returns the hyperbolic cosine of the specified value.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def HSin : DXILOpMapping<19, unary, int_sinh,
- "Returns the hyperbolic sine of the specified value.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def HTan : DXILOpMapping<20, unary, int_tanh,
- "Returns the hyperbolic tan of the specified value.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-
-def Exp2 : DXILOpMapping<21, unary, int_exp2,
- "Returns the base 2 exponential, or 2**x, of the specified value."
- "exp2(x) = 2**x.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Frac : DXILOpMapping<22, unary, int_dx_frac,
- "Returns a fraction from 0 to 1 that represents the "
- "decimal part of the input.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Log2 : DXILOpMapping<23, unary, int_log2,
- "Returns the base-2 logarithm of the specified value.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Sqrt : DXILOpMapping<24, unary, int_sqrt,
- "Returns the square root of the specified floating-point"
- "value, per component.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def RSqrt : DXILOpMapping<25, unary, int_dx_rsqrt,
- "Returns the reciprocal of the square root of the specified value."
- "rsqrt(x) = 1 / sqrt(x).",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Round : DXILOpMapping<26, unary, int_roundeven,
- "Returns the input rounded to the nearest integer"
- "within a floating-point type.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Floor : DXILOpMapping<27, unary, int_floor,
- "Returns the largest integer that is less than or equal to the input.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Ceil : DXILOpMapping<28, unary, int_ceil,
- "Returns the smallest integer that is greater than or equal to the input.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Trunc : DXILOpMapping<29, unary, int_trunc,
- "Returns the specified value truncated to the integer component.",
- [llvm_halforfloat_ty, LLVMMatchType<0>]>;
-def Rbits : DXILOpMapping<30, unary, int_bitreverse,
- "Returns the specified value with its bits reversed.",
- [llvm_anyint_ty, LLVMMatchType<0>]>;
-def FMax : DXILOpMapping<35, binary, int_maxnum,
- "Float maximum. FMax(a,b) = a > b ? a : b">;
-def FMin : DXILOpMapping<36, binary, int_minnum,
- "Float minimum. FMin(a,b) = a < b ? a : b">;
-def SMax : DXILOpMapping<37, binary, int_smax,
- "Signed integer maximum. SMax(a,b) = a > b ? a : b">;
-def SMin : DXILOpMapping<38, binary, int_smin,
- "Signed integer minimum. SMin(a,b) = a < b ? a : b">;
-def UMax : DXILOpMapping<39, binary, int_umax,
- "Unsigned integer maximum. UMax(a,b) = a > b ? a : b">;
-def UMin : DXILOpMapping<40, binary, int_umin,
- "Unsigned integer minimum. UMin(a,b) = a < b ? a : b">;
-def FMad : DXILOpMapping<46, tertiary, int_fmuladd,
- "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m * a + b.">;
-def IMad : DXILOpMapping<48, tertiary, int_dx_imad,
- "Signed integer arithmetic multiply/add operation. imad(m,a,b) = m * a + b.">;
-def UMad : DXILOpMapping<49, tertiary, int_dx_umad,
- "Unsigned integer arithmetic multiply/add operation. umad(m,a,b) = m * a + b.">;
-let OpTypes = !listconcat([llvm_halforfloat_ty], !listsplat(llvm_halforfloat_ty, 4)) in
- def Dot2 : DXILOpMapping<54, dot2, int_dx_dot2,
- "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + a[n]*b[n] where n is between 0 and 1">;
-let OpTypes = !listconcat([llvm_halforfloat_ty], !listsplat(llvm_halforfloat_ty, 6)) in
- def Dot3 : DXILOpMapping<55, dot3, int_dx_dot3,
- "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + a[n]*b[n] where n is between 0 and 2">;
-let OpTypes = !listconcat([llvm_halforfloat_ty], !listsplat(llvm_halforfloat_ty, 8)) in
- def Dot4 : DXILOpMapping<56, dot4, int_dx_dot4,
- "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + a[n]*b[n] where n is between 0 and 3">;
-def ThreadId : DXILOpMapping<93, threadId, int_dx_thread_id,
- "Reads the thread ID">;
-def GroupId : DXILOpMapping<94, groupId, int_dx_group_id,
- "Reads the group ID (SV_GroupID)">;
-def ThreadIdInGroup : DXILOpMapping<95, threadIdInGroup,
- int_dx_thread_id_in_group,
- "Reads the thread ID within the group "
- "(SV_GroupThreadID)">;
-def FlattenedThreadIdInGroup : DXILOpMapping<96, flattenedThreadIdInGroup,
- int_dx_flattened_thread_id_in_group,
- "Provides a flattened index for a "
- "given thread within a given "
- "group (SV_GroupIndex)">;
+class DXILShaderStage;
+
+def compute : DXILShaderStage;
+def domain : DXILShaderStage;
+def hull : DXILShaderStage;
+def pixel : DXILShaderStage;
+def vertex : DXILShaderStage;
+def geometry : DXILShaderStage;
+def library : DXILShaderStage;
+def amplification : DXILShaderStage;
+def mesh : DXILShaderStage;
+def node : DXILShaderStage;
+def raygeneration : DXILShaderStage;
+def intersection : DXILShaderStage;
+def anyhit : DXILShaderStage;
+def closesthit : DXILShaderStage;
+def callable : DXILShaderStage;
+def miss : DXILShaderStage;
+
+// Pseudo-stages
+// Denotes that a DXIL Op is supported in all stages
+def all_stages : DXILShaderStage;
+// Denotes that support for a DXIL Op has been removed
+def removed : DXILShaderStage;
+
+// DXIL Op attributes
+
+class DXILAttribute;
+
+def ReadOnly : DXILAttribute;
+def ReadNone : DXILAttribute;
+def IsDerivative : DXILAttribute;
+def IsGradient : DXILAttribute;
+def IsFeedback : DXILAttribute;
+def IsWave : DXILAttribute;
+def NeedsUniformInputs : DXILAttribute;
+def IsBarrier : DXILAttribute;
+
+class Overloads<Version ver, list<LLVMType> ols> {
+ Version dxil_version = ver;
+ list<LLVMType> overload_types = ols;
+}
+
+class Stages<Version ver, list<DXILShaderStage> st> {
+ Version dxil_version = ver;
+ list<DXILShaderStage> shader_stages = st;
+}
+
+class Attributes<Version ver = DXIL1_0, list<DXILAttribute> attrs> {
+ Version dxil_version = ver;
+ list<DXILAttribute> op_attrs = attrs;
+}
+
+// Abstraction DXIL Operation
+class DXILOp<int opcode, DXILOpClass opclass> {
+ // A short description of the operation
+ string Doc = "";
+
+ // Opcode of DXIL Operation
+ int OpCode = opcode;
+
+ // Class of DXIL Operation.
+ DXILOpClass OpClass = opclass;
+
+ // LLVM Intrinsic DXIL Operation maps to
+ Intrinsic LLVMIntrinsic = ?;
+
+ // Result type of the op
+ LLVMType result;
+
+ // List of argument types of the op. Default to 0 arguments.
+ list<LLVMType> arguments = [];
+
+ // List of valid overload types predicated by DXIL version
+ list<Overloads> overloads = [];
+
+ // List of valid shader stages predicated by DXIL version
+ list<Stages> stages;
+
+ // Versioned attributes of operation
+ list<Attributes> attributes = [];
+}
+
+// Concrete definitions of DXIL Operations
+
+def Abs : DXILOp<6, unary> {
+ let Doc = "Returns the absolute value of the input.";
+ let LLVMIntrinsic = int_fabs;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def IsInf : DXILOp<9, isSpecialFloat> {
+ let Doc = "Determines if the specified value is infinite.";
+ let LLVMIntrinsic = int_dx_isinf;
+ let arguments = [overloadTy];
+ let result = i1Ty;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Cos : DXILOp<12, unary> {
+ let Doc = "Returns cosine(theta) for theta in radians.";
+ let LLVMIntrinsic = int_cos;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Sin : DXILOp<13, unary> {
+ let Doc = "Returns sine(theta) for theta in radians.";
+ let LLVMIntrinsic = int_sin;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Tan : DXILOp<14, unary> {
+ let Doc = "Returns tangent(theta) for theta in radians.";
+ let LLVMIntrinsic = int_tan;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ACos : DXILOp<15, unary> {
+ let Doc = "Returns the arccosine of the specified value.";
+ let LLVMIntrinsic = int_acos;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ASin : DXILOp<16, unary> {
+ let Doc = "Returns the arcsine of the specified value.";
+ let LLVMIntrinsic = int_asin;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ATan : DXILOp<17, unary> {
+ let Doc = "Returns the arctangent of the specified value.";
+ let LLVMIntrinsic = int_atan;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def HCos : DXILOp<18, unary> {
+ let Doc = "Returns the hyperbolic cosine of the specified value.";
+ let LLVMIntrinsic = int_cosh;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def HSin : DXILOp<19, unary> {
+ let Doc = "Returns the hyperbolic sine of the specified value.";
+ let LLVMIntrinsic = int_sinh;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def HTan : DXILOp<20, unary> {
+ let Doc = "Returns the hyperbolic tan of the specified value.";
+ let LLVMIntrinsic = int_tanh;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Exp2 : DXILOp<21, unary> {
+ let Doc = "Returns the base 2 exponential, or 2**x, of the specified value. "
+ "exp2(x) = 2**x.";
+ let LLVMIntrinsic = int_exp2;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Frac : DXILOp<22, unary> {
+ let Doc = "Returns a fraction from 0 to 1 that represents the decimal part "
+ "of the input.";
+ let LLVMIntrinsic = int_dx_frac;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Log2 : DXILOp<23, unary> {
+ let Doc = "Returns the base-2 logarithm of the specified value.";
+ let LLVMIntrinsic = int_log2;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Sqrt : DXILOp<24, unary> {
+ let Doc = "Returns the square root of the specified floating-point value, "
+ "per component.";
+ let LLVMIntrinsic = int_sqrt;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def RSqrt : DXILOp<25, unary> {
+ let Doc = "Returns the reciprocal of the square root of the specified value. "
+ "rsqrt(x) = 1 / sqrt(x).";
+ let LLVMIntrinsic = int_dx_rsqrt;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Round : DXILOp<26, unary> {
+ let Doc = "Returns the input rounded to the nearest integer within a "
+ "floating-point type.";
+ let LLVMIntrinsic = int_roundeven;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Floor : DXILOp<27, unary> {
+ let Doc =
+ "Returns the largest integer that is less than or equal to the input.";
+ let LLVMIntrinsic = int_floor;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Ceil : DXILOp<28, unary> {
+ let Doc = "Returns the smallest integer that is greater than or equal to the "
+ "input.";
+ let LLVMIntrinsic = int_ceil;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Trunc : DXILOp<29, unary> {
+ let Doc = "Returns the specified value truncated to the integer component.";
+ let LLVMIntrinsic = int_trunc;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Rbits : DXILOp<30, unary> {
+ let Doc = "Returns the specified value with its bits reversed.";
+ let LLVMIntrinsic = int_bitreverse;
+ let arguments = [LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FMax : DXILOp<35, binary> {
+ let Doc = "Float maximum. FMax(a,b) = a > b ? a : b";
+ let LLVMIntrinsic = int_maxnum;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FMin : DXILOp<36, binary> {
+ let Doc = "Float minimum. FMin(a,b) = a < b ? a : b";
+ let LLVMIntrinsic = int_minnum;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def SMax : DXILOp<37, binary> {
+ let Doc = "Signed integer maximum. SMax(a,b) = a > b ? a : b";
+ let LLVMIntrinsic = int_smax;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def SMin : DXILOp<38, binary> {
+ let Doc = "Signed integer minimum. SMin(a,b) = a < b ? a : b";
+ let LLVMIntrinsic = int_smin;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def UMax : DXILOp<39, binary> {
+ let Doc = "Unsigned integer maximum. UMax(a,b) = a > b ? a : b";
+ let LLVMIntrinsic = int_umax;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def UMin : DXILOp<40, binary> {
+ let Doc = "Unsigned integer minimum. UMin(a,b) = a < b ? a : b";
+ let LLVMIntrinsic = int_umin;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FMad : DXILOp<46, tertiary> {
+ let Doc = "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m "
+ "* a + b.";
+ let LLVMIntrinsic = int_fmuladd;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [halfTy, floatTy, doubleTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def IMad : DXILOp<48, tertiary> {
+ let Doc = "Signed integer arithmetic multiply/add operation. imad(m,a,b) = m "
+ "* a + b.";
+ let LLVMIntrinsic = int_dx_imad;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def UMad : DXILOp<49, tertiary> {
+ let Doc = "Unsigned integer arithmetic multiply/add operation. umad(m,a, = m "
+ "* a + b.";
+ let LLVMIntrinsic = int_dx_umad;
+ let arguments = [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>];
+ let result = overloadTy;
+ let overloads =
+ [Overloads<DXIL1_0, [i16Ty, i32Ty, i64Ty]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Dot2 : DXILOp<54, dot2> {
+ let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + "
+ "a[n]*b[n] where n is between 0 and 1";
+ let LLVMIntrinsic = int_dx_dot2;
+ let arguments = !listsplat(overloadTy, 4);
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Dot3 : DXILOp<55, dot3> {
+ let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + "
+ "a[n]*b[n] where n is between 0 and 2";
+ let LLVMIntrinsic = int_dx_dot3;
+ let arguments = !listsplat(overloadTy, 6);
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def Dot4 : DXILOp<56, dot4> {
+ let Doc = "dot product of two float vectors Dot(a,b) = a[0]*b[0] + ... + "
+ "a[n]*b[n] where n is between 0 and 3";
+ let LLVMIntrinsic = int_dx_dot4;
+ let arguments = !listsplat(overloadTy, 8);
+ let result = overloadTy;
+ let overloads = [Overloads<DXIL1_0, [halfTy, floatTy]>];
+ let stages = [Stages<DXIL1_0, [all_stages]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ThreadId : DXILOp<93, threadId> {
+ let Doc = "Reads the thread ID";
+ let LLVMIntrinsic = int_dx_thread_id;
+ let arguments = [i32Ty];
+ let result = i32Ty;
+ let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def GroupId : DXILOp<94, groupId> {
+ let Doc = "Reads the group ID (SV_GroupID)";
+ let LLVMIntrinsic = int_dx_group_id;
+ let arguments = [i32Ty];
+ let result = i32Ty;
+ let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def ThreadIdInGroup : DXILOp<95, threadIdInGroup> {
+ let Doc = "Reads the thread ID within the group (SV_GroupThreadID)";
+ let LLVMIntrinsic = int_dx_thread_id_in_group;
+ let arguments = [i32Ty];
+ let result = i32Ty;
+ let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
+def FlattenedThreadIdInGroup : DXILOp<96, flattenedThreadIdInGroup> {
+ let Doc = "Provides a flattened index for a given thread within a given "
+ "group (SV_GroupIndex)";
+ let LLVMIntrinsic = int_dx_flattened_thread_id_in_group;
+ let result = i32Ty;
+ let stages = [Stages<DXIL1_0, [compute, mesh, amplification, node]>];
+ let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
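
Each record above ties its overload, stage, and attribute lists to the minimum DXIL version that supports them (DXIL1_0 throughout this batch). Below is a rough, standalone C++ sketch of the per-version data this implies for a single op; the struct names mirror the OpOverload/OpStage/OpAttribute helpers added to DXILOpBuilder.cpp later in this patch, while the bitmask values are illustrative only and the real tables are generated into DXILOperation.inc by TableGen.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical, simplified view of what "def Round" encodes: valid overload
// types, shader stages, and attributes, each keyed by a minimum DXIL version.
struct Version { unsigned Major = 0, Minor = 0; };
struct OpOverload  { Version DXILVersion; uint16_t ValidTys; };
struct OpStage     { Version DXILVersion; uint32_t ValidStages; };
struct OpAttribute { Version DXILVersion; uint32_t ValidAttrs; };

// Illustrative masks; the generated tables define the real encodings.
constexpr uint16_t HALF = 1 << 1, FLOAT = 1 << 2;
constexpr uint32_t ALL_STAGES = ~0u, READ_NONE = 1u << 0;

struct OpInfo {
  unsigned OpCode;
  std::vector<OpOverload> Overloads;
  std::vector<OpStage> Stages;
  std::vector<OpAttribute> Attributes;
};

int main() {
  // Round (opcode 26): half/float overloads, all stages, ReadNone, since DXIL 1.0.
  OpInfo Round{26,
               {{{1, 0}, static_cast<uint16_t>(HALF | FLOAT)}},
               {{{1, 0}, ALL_STAGES}},
               {{{1, 0}, READ_NONE}}};
  std::printf("opcode %u, %zu overload record(s)\n", Round.OpCode,
              Round.Overloads.size());
}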
diff --git a/llvm/lib/Target/DirectX/DXILConstants.h b/llvm/lib/Target/DirectX/DXILConstants.h
index f7e37ba..0c9c1ac 100644
--- a/llvm/lib/Target/DirectX/DXILConstants.h
+++ b/llvm/lib/Target/DirectX/DXILConstants.h
@@ -15,9 +15,15 @@
namespace llvm {
namespace dxil {
-#define DXIL_OP_ENUM
+enum class OpCode : unsigned {
+#define DXIL_OPCODE(Op, Name) Name = Op,
#include "DXILOperation.inc"
-#undef DXIL_OP_ENUM
+};
+
+enum class OpCodeClass : unsigned {
+#define DXIL_OPCLASS(Name) Name,
+#include "DXILOperation.inc"
+};
} // namespace dxil
} // namespace llvm
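
The enum bodies above are produced by expanding the DXIL_OPCODE and DXIL_OPCLASS macros against the TableGen-generated DXILOperation.inc. Here is a self-contained sketch of the same X-macro pattern, using a hand-written list in place of the generated include, with opcode values copied from the DXIL.td additions above.

#include <cstdio>

// Stand-in for the generated DXILOperation.inc: each entry pairs a DXIL
// opcode number with its enumerator name.
#define DXIL_OPCODE_LIST(X)                                                    \
  X(26, Round)                                                                 \
  X(27, Floor)                                                                 \
  X(28, Ceil)                                                                  \
  X(93, ThreadId)

enum class OpCode : unsigned {
#define DXIL_OPCODE(Op, Name) Name = Op,
  DXIL_OPCODE_LIST(DXIL_OPCODE)
#undef DXIL_OPCODE
};

int main() {
  // Each enumerator carries its DXIL opcode value.
  std::printf("Round=%u ThreadId=%u\n", static_cast<unsigned>(OpCode::Round),
              static_cast<unsigned>(OpCode::ThreadId)); // prints 26 and 93
}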
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
index 0b3982e..a03701b 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp
@@ -15,6 +15,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/DXILABI.h"
#include "llvm/Support/ErrorHandling.h"
+#include <optional>
using namespace llvm;
using namespace llvm::dxil;
@@ -22,8 +23,8 @@ using namespace llvm::dxil;
constexpr StringLiteral DXILOpNamePrefix = "dx.op.";
namespace {
-
enum OverloadKind : uint16_t {
+ UNDEFINED = 0,
VOID = 1,
HALF = 1 << 1,
FLOAT = 1 << 2,
@@ -36,9 +37,27 @@ enum OverloadKind : uint16_t {
UserDefineType = 1 << 9,
ObjectType = 1 << 10,
};
+struct Version {
+ unsigned Major = 0;
+ unsigned Minor = 0;
+};
+struct OpOverload {
+ Version DXILVersion;
+ uint16_t ValidTys;
+};
} // namespace
+struct OpStage {
+ Version DXILVersion;
+ uint32_t ValidStages;
+};
+
+struct OpAttribute {
+ Version DXILVersion;
+ uint32_t ValidAttrs;
+};
+
static const char *getOverloadTypeName(OverloadKind Kind) {
switch (Kind) {
case OverloadKind::HALF:
@@ -58,12 +77,13 @@ static const char *getOverloadTypeName(OverloadKind Kind) {
case OverloadKind::I64:
return "i64";
case OverloadKind::VOID:
+ case OverloadKind::UNDEFINED:
+ return "void";
case OverloadKind::ObjectType:
case OverloadKind::UserDefineType:
break;
}
llvm_unreachable("invalid overload type for name");
- return "void";
}
static OverloadKind getOverloadKind(Type *Ty) {
@@ -131,8 +151,9 @@ struct OpCodeProperty {
dxil::OpCodeClass OpCodeClass;
// Offset in DXILOpCodeClassNameTable.
unsigned OpCodeClassNameOffset;
- uint16_t OverloadTys;
- llvm::Attribute::AttrKind FuncAttr;
+ llvm::SmallVector<OpOverload> Overloads;
+ llvm::SmallVector<OpStage> Stages;
+ llvm::SmallVector<OpAttribute> Attributes;
int OverloadParamIndex; // parameter index which control the overload.
// When < 0, should be only 1 overload type.
unsigned NumOfParameters; // Number of parameters include return value.
@@ -221,6 +242,45 @@ static Type *getTypeFromParameterKind(ParameterKind Kind, Type *OverloadTy) {
return nullptr;
}
+static ShaderKind getShaderKindEnum(Triple::EnvironmentType EnvType) {
+ switch (EnvType) {
+ case Triple::Pixel:
+ return ShaderKind::pixel;
+ case Triple::Vertex:
+ return ShaderKind::vertex;
+ case Triple::Geometry:
+ return ShaderKind::geometry;
+ case Triple::Hull:
+ return ShaderKind::hull;
+ case Triple::Domain:
+ return ShaderKind::domain;
+ case Triple::Compute:
+ return ShaderKind::compute;
+ case Triple::Library:
+ return ShaderKind::library;
+ case Triple::RayGeneration:
+ return ShaderKind::raygeneration;
+ case Triple::Intersection:
+ return ShaderKind::intersection;
+ case Triple::AnyHit:
+ return ShaderKind::anyhit;
+ case Triple::ClosestHit:
+ return ShaderKind::closesthit;
+ case Triple::Miss:
+ return ShaderKind::miss;
+ case Triple::Callable:
+ return ShaderKind::callable;
+ case Triple::Mesh:
+ return ShaderKind::mesh;
+ case Triple::Amplification:
+ return ShaderKind::amplification;
+ default:
+ break;
+ }
+ llvm_unreachable(
+ "Shader Kind Not Found - Invalid DXIL Environment Specified");
+}
+
/// Construct DXIL function type. This is the type of a function with
/// the following prototype
/// OverloadType dx.op.<opclass>.<return-type>(int opcode, <param types>)
@@ -232,7 +292,7 @@ static FunctionType *getDXILOpFunctionType(const OpCodeProperty *Prop,
Type *ReturnTy, Type *OverloadTy) {
SmallVector<Type *> ArgTys;
- auto ParamKinds = getOpCodeParameterKind(*Prop);
+ const ParameterKind *ParamKinds = getOpCodeParameterKind(*Prop);
// Add ReturnTy as return type of the function
ArgTys.emplace_back(ReturnTy);
@@ -249,17 +309,103 @@ static FunctionType *getDXILOpFunctionType(const OpCodeProperty *Prop,
ArgTys[0], ArrayRef<Type *>(&ArgTys[1], ArgTys.size() - 1), false);
}
+/// Get index of the property from PropList valid for the most recent
+/// DXIL version not greater than DXILVer.
+/// PropList is expected to be sorted in ascending order of DXIL version.
+template <typename T>
+static std::optional<size_t> getPropIndex(ArrayRef<T> PropList,
+ const VersionTuple DXILVer) {
+ size_t Index = PropList.size() - 1;
+ for (auto Iter = PropList.rbegin(); Iter != PropList.rend();
+ Iter++, Index--) {
+ const T &Prop = *Iter;
+ if (VersionTuple(Prop.DXILVersion.Major, Prop.DXILVersion.Minor) <=
+ DXILVer) {
+ return Index;
+ }
+ }
+ return std::nullopt;
+}
+
namespace llvm {
namespace dxil {
+// No extra checks on TargetTriple need be performed to verify that the
+// Triple is well-formed or that the target is supported since these checks
+// would have been done at the time the module M is constructed in the earlier
+// stages of compilation.
+DXILOpBuilder::DXILOpBuilder(Module &M, IRBuilderBase &B) : M(M), B(B) {
+ Triple TT(Triple(M.getTargetTriple()));
+ DXILVersion = TT.getDXILVersion();
+ ShaderStage = TT.getEnvironment();
+ // Ensure Environment type is known
+ if (ShaderStage == Triple::UnknownEnvironment) {
+ report_fatal_error(
+ Twine(DXILVersion.getAsString()) +
+ ": Unknown Compilation Target Shader Stage specified ",
+ /*gen_crash_diag*/ false);
+ }
+}
+
CallInst *DXILOpBuilder::createDXILOpCall(dxil::OpCode OpCode, Type *ReturnTy,
Type *OverloadTy,
SmallVector<Value *> Args) {
+
const OpCodeProperty *Prop = getOpCodeProperty(OpCode);
+ std::optional<size_t> OlIndexOrErr =
+ getPropIndex(ArrayRef(Prop->Overloads), DXILVersion);
+ if (!OlIndexOrErr.has_value()) {
+ report_fatal_error(Twine(getOpCodeName(OpCode)) +
+ ": No valid overloads found for DXIL Version - " +
+ DXILVersion.getAsString(),
+ /*gen_crash_diag*/ false);
+ }
+ uint16_t ValidTyMask = Prop->Overloads[*OlIndexOrErr].ValidTys;
OverloadKind Kind = getOverloadKind(OverloadTy);
- if ((Prop->OverloadTys & (uint16_t)Kind) == 0) {
- report_fatal_error("Invalid Overload Type", /* gen_crash_diag=*/false);
+
+ // Check if the operation supports overload types and OverloadTy is valid
+ // per the specified types for the operation
+ if ((ValidTyMask != OverloadKind::UNDEFINED) &&
+ (ValidTyMask & (uint16_t)Kind) == 0) {
+ report_fatal_error(Twine("Invalid Overload Type for DXIL operation - ") +
+ getOpCodeName(OpCode),
+ /* gen_crash_diag=*/false);
+ }
+
+ // Perform necessary checks to ensure Opcode is valid in the targeted shader
+ // kind
+ std::optional<size_t> StIndexOrErr =
+ getPropIndex(ArrayRef(Prop->Stages), DXILVersion);
+ if (!StIndexOrErr.has_value()) {
+ report_fatal_error(Twine(getOpCodeName(OpCode)) +
+ ": No valid stages found for DXIL Version - " +
+ DXILVersion.getAsString(),
+ /*gen_crash_diag*/ false);
+ }
+ uint16_t ValidShaderKindMask = Prop->Stages[*StIndexOrErr].ValidStages;
+
+ // Ensure valid shader stage properties are specified
+ if (ValidShaderKindMask == ShaderKind::removed) {
+ report_fatal_error(
+ Twine(DXILVersion.getAsString()) +
+ ": Unsupported Target Shader Stage for DXIL operation - " +
+ getOpCodeName(OpCode),
+ /*gen_crash_diag*/ false);
+ }
+
+ // The shader stage itself needs no separate validation here since
+ // getShaderKindEnum() aborts on an unknown shader stage.
+
+ // Verify the target shader stage is valid for the DXIL operation
+ ShaderKind ModuleStagekind = getShaderKindEnum(ShaderStage);
+ if (!(ValidShaderKindMask & ModuleStagekind)) {
+ auto ShaderEnvStr = Triple::getEnvironmentTypeName(ShaderStage);
+ report_fatal_error(Twine(ShaderEnvStr) +
+ " : Invalid Shader Stage for DXIL operation - " +
+ getOpCodeName(OpCode) + " for DXIL Version " +
+ DXILVersion.getAsString(),
+ /*gen_crash_diag*/ false);
}
std::string DXILFnName = constructOverloadName(Kind, OverloadTy, *Prop);
@@ -282,40 +428,18 @@ Type *DXILOpBuilder::getOverloadTy(dxil::OpCode OpCode, FunctionType *FT) {
// If DXIL Op has no overload parameter, just return the
// precise return type specified.
if (Prop->OverloadParamIndex < 0) {
- auto &Ctx = FT->getContext();
- switch (Prop->OverloadTys) {
- case OverloadKind::VOID:
- return Type::getVoidTy(Ctx);
- case OverloadKind::HALF:
- return Type::getHalfTy(Ctx);
- case OverloadKind::FLOAT:
- return Type::getFloatTy(Ctx);
- case OverloadKind::DOUBLE:
- return Type::getDoubleTy(Ctx);
- case OverloadKind::I1:
- return Type::getInt1Ty(Ctx);
- case OverloadKind::I8:
- return Type::getInt8Ty(Ctx);
- case OverloadKind::I16:
- return Type::getInt16Ty(Ctx);
- case OverloadKind::I32:
- return Type::getInt32Ty(Ctx);
- case OverloadKind::I64:
- return Type::getInt64Ty(Ctx);
- default:
- llvm_unreachable("invalid overload type");
- return nullptr;
- }
+ return FT->getReturnType();
}
- // Prop->OverloadParamIndex is 0, overload type is FT->getReturnType().
+ // Consider FT->getReturnType() as default overload type, unless
+ // Prop->OverloadParamIndex != 0.
Type *OverloadType = FT->getReturnType();
if (Prop->OverloadParamIndex != 0) {
// Skip Return Type.
OverloadType = FT->getParamType(Prop->OverloadParamIndex - 1);
}
- auto ParamKinds = getOpCodeParameterKind(*Prop);
+ const ParameterKind *ParamKinds = getOpCodeParameterKind(*Prop);
auto Kind = ParamKinds[Prop->OverloadParamIndex];
// For ResRet and CBufferRet, OverloadTy is in field of StructType.
if (Kind == ParameterKind::CBufferRet ||
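
getPropIndex() above encodes the version-selection rule used for overloads, stages, and attributes: scan the ascending-sorted property list from the back and pick the most recent entry whose DXIL version does not exceed the requested one. A minimal standalone sketch of that rule, with simplified types and made-up payload values:

#include <cassert>
#include <optional>
#include <vector>

struct Version { unsigned Major, Minor; };

static bool lessEq(Version A, Version B) {
  return A.Major < B.Major || (A.Major == B.Major && A.Minor <= B.Minor);
}

struct Prop { Version DXILVersion; unsigned Payload; };

// Return the index of the latest entry whose version is <= DXILVer, if any.
static std::optional<size_t> getPropIndex(const std::vector<Prop> &Props,
                                          Version DXILVer) {
  for (size_t I = Props.size(); I-- > 0;)
    if (lessEq(Props[I].DXILVersion, DXILVer))
      return I;
  return std::nullopt;
}

int main() {
  // Hypothetical op whose overload mask was widened in DXIL 1.2.
  std::vector<Prop> Overloads = {{{1, 0}, 0x6}, {{1, 2}, 0xE}};
  assert(getPropIndex(Overloads, {1, 0}) == std::optional<size_t>(0));
  assert(getPropIndex(Overloads, {1, 3}) == std::optional<size_t>(1));
  assert(!getPropIndex(Overloads, {0, 9}).has_value()); // nothing valid yet
  return 0;
}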
diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h
index 5babeae..abb9a8d 100644
--- a/llvm/lib/Target/DirectX/DXILOpBuilder.h
+++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h
@@ -14,6 +14,7 @@
#include "DXILConstants.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/TargetParser/Triple.h"
namespace llvm {
class Module;
@@ -28,11 +29,16 @@ namespace dxil {
class DXILOpBuilder {
public:
- DXILOpBuilder(Module &M, IRBuilderBase &B) : M(M), B(B) {}
+ DXILOpBuilder(Module &M, IRBuilderBase &B);
/// Create an instruction that calls DXIL Op with return type, specified
- /// opcode, and call arguments. \param OpCode Opcode of the DXIL Op call
- /// constructed \param ReturnTy Return type of the DXIL Op call constructed
+ /// opcode, and call arguments.
+ ///
+ /// \param OpCode Opcode of the DXIL Op call constructed
+ /// (DXIL version and shader stage are taken from the module's target triple
+ /// by the DXILOpBuilder constructor rather than passed per call.)
+ /// \param ReturnTy Return type of the DXIL Op call constructed
/// \param OverloadTy Overload type of the DXIL Op call constructed
+ /// \param Args Arguments for the DXIL Op call constructed
/// \return DXIL Op call constructed
CallInst *createDXILOpCall(dxil::OpCode OpCode, Type *ReturnTy,
Type *OverloadTy, SmallVector<Value *> Args);
@@ -42,6 +48,8 @@ public:
private:
Module &M;
IRBuilderBase &B;
+ VersionTuple DXILVersion;
+ Triple::EnvironmentType ShaderStage;
};
} // namespace dxil
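
With the per-op data now version- and stage-dependent, the builder needs both at construction time, which is why the constructor moved out of line: it derives them from the module's target triple (see DXILOpBuilder.cpp above). A small sketch of that derivation against the Triple API the constructor uses follows; the triple string is only an example, and building it requires linking against LLVM's TargetParser library.

#include "llvm/Support/VersionTuple.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdio>

int main() {
  // Example DirectX target triple: DXIL 1.3, Shader Model 6.3, compute stage.
  llvm::Triple TT("dxilv1.3-pc-shadermodel6.3-compute");
  llvm::VersionTuple DXILVersion = TT.getDXILVersion();
  llvm::Triple::EnvironmentType Stage = TT.getEnvironment();
  std::printf("DXIL %s, stage %s\n", DXILVersion.getAsString().c_str(),
              llvm::Triple::getEnvironmentTypeName(Stage).str().c_str());
}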
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 1329308..0c0edde4 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -104,20 +104,19 @@ static void lowerIntrinsic(dxil::OpCode DXILOp, Function &F, Module &M) {
static bool lowerIntrinsics(Module &M) {
bool Updated = false;
-#define DXIL_OP_INTRINSIC_MAP
-#include "DXILOperation.inc"
-#undef DXIL_OP_INTRINSIC_MAP
-
for (Function &F : make_early_inc_range(M.functions())) {
if (!F.isDeclaration())
continue;
Intrinsic::ID ID = F.getIntrinsicID();
- if (ID == Intrinsic::not_intrinsic)
+ switch (ID) {
+ default:
continue;
- auto LowerIt = LowerMap.find(ID);
- if (LowerIt == LowerMap.end())
- continue;
- lowerIntrinsic(LowerIt->second, F, M);
+#define DXIL_OP_INTRINSIC(OpCode, Intrin) \
+ case Intrin: \
+ lowerIntrinsic(OpCode, F, M); \
+ break;
+#include "DXILOperation.inc"
+ }
Updated = true;
}
return Updated;
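
The DenseMap lookup is gone: the switch body is now stamped out by expanding DXIL_OP_INTRINSIC against the generated DXILOperation.inc, one case per mapped intrinsic. Below is a self-contained sketch of that dispatch pattern with a hand-written stand-in list and made-up intrinsic IDs; the real pairs (e.g. Round mapped from llvm.roundeven) come from the DXIL.td records earlier in this patch.

#include <cstdio>

enum class OpCode : unsigned { Round = 26, Floor = 27 };
enum IntrinsicID { not_intrinsic = 0, roundeven = 101, floor_id = 102, other = 999 };

// Stand-in for the generated include: one (DXIL opcode, LLVM intrinsic) pair
// per lowerable intrinsic.
#define DXIL_OP_INTRINSIC_LIST(X)                                              \
  X(OpCode::Round, roundeven)                                                  \
  X(OpCode::Floor, floor_id)

static bool lowerOne(IntrinsicID ID) {
  switch (ID) {
  default:
    return false; // unmapped intrinsics are skipped, as in lowerIntrinsics()
#define DXIL_OP_INTRINSIC(Op, Intrin)                                          \
  case Intrin:                                                                 \
    std::printf("lower intrinsic %d -> dx.op %u\n", ID,                        \
                static_cast<unsigned>(Op));                                    \
    break;
    DXIL_OP_INTRINSIC_LIST(DXIL_OP_INTRINSIC)
#undef DXIL_OP_INTRINSIC
  }
  return true;
}

int main() {
  lowerOne(roundeven); // mapped: lowered
  lowerOne(other);     // unmapped: skipped
}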
diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index 889de3a..5609886 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -246,7 +246,7 @@ public:
DXILPrepareModule() : ModulePass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addPreserved<ShaderFlagsAnalysisWrapper>();
- AU.addPreserved<DXILResourceWrapper>();
+ AU.addPreserved<DXILResourceMDWrapper>();
}
static char ID; // Pass identification.
};
diff --git a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
index 7ae568c..99cc406 100644
--- a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
@@ -41,7 +41,7 @@ public:
bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
- AU.addRequired<DXILResourceWrapper>();
+ AU.addRequired<DXILResourceMDWrapper>();
}
};
} // namespace
@@ -49,12 +49,12 @@ public:
char DXILPrettyPrinter::ID = 0;
INITIALIZE_PASS_BEGIN(DXILPrettyPrinter, "dxil-pretty-printer",
"DXIL Metadata Pretty Printer", true, true)
-INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapper)
+INITIALIZE_PASS_DEPENDENCY(DXILResourceMDWrapper)
INITIALIZE_PASS_END(DXILPrettyPrinter, "dxil-pretty-printer",
"DXIL Metadata Pretty Printer", true, true)
bool DXILPrettyPrinter::runOnModule(Module &M) {
- dxil::Resources &Res = getAnalysis<DXILResourceWrapper>().getDXILResource();
+ dxil::Resources &Res = getAnalysis<DXILResourceMDWrapper>().getDXILResource();
Res.print(OS);
return false;
}
diff --git a/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp b/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp
index 0b2f0d8..33e0119 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp
+++ b/llvm/lib/Target/DirectX/DXILResourceAnalysis.cpp
@@ -18,35 +18,35 @@ using namespace llvm;
#define DEBUG_TYPE "dxil-resource-analysis"
-dxil::Resources DXILResourceAnalysis::run(Module &M,
- ModuleAnalysisManager &AM) {
+dxil::Resources DXILResourceMDAnalysis::run(Module &M,
+ ModuleAnalysisManager &AM) {
dxil::Resources R;
R.collect(M);
return R;
}
-AnalysisKey DXILResourceAnalysis::Key;
+AnalysisKey DXILResourceMDAnalysis::Key;
-PreservedAnalyses DXILResourcePrinterPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- dxil::Resources Res = AM.getResult<DXILResourceAnalysis>(M);
+PreservedAnalyses DXILResourceMDPrinterPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ dxil::Resources Res = AM.getResult<DXILResourceMDAnalysis>(M);
Res.print(OS);
return PreservedAnalyses::all();
}
-char DXILResourceWrapper::ID = 0;
-INITIALIZE_PASS_BEGIN(DXILResourceWrapper, DEBUG_TYPE,
+char DXILResourceMDWrapper::ID = 0;
+INITIALIZE_PASS_BEGIN(DXILResourceMDWrapper, DEBUG_TYPE,
"DXIL resource Information", true, true)
-INITIALIZE_PASS_END(DXILResourceWrapper, DEBUG_TYPE,
+INITIALIZE_PASS_END(DXILResourceMDWrapper, DEBUG_TYPE,
"DXIL resource Information", true, true)
-bool DXILResourceWrapper::runOnModule(Module &M) {
+bool DXILResourceMDWrapper::runOnModule(Module &M) {
Resources.collect(M);
return false;
}
-DXILResourceWrapper::DXILResourceWrapper() : ModulePass(ID) {}
+DXILResourceMDWrapper::DXILResourceMDWrapper() : ModulePass(ID) {}
-void DXILResourceWrapper::print(raw_ostream &OS, const Module *) const {
+void DXILResourceMDWrapper::print(raw_ostream &OS, const Module *) const {
Resources.print(OS);
}
diff --git a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
index bce4116..3a2b8a9 100644
--- a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
+++ b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h
@@ -20,8 +20,9 @@
namespace llvm {
/// Analysis pass that exposes the \c DXILResource for a module.
-class DXILResourceAnalysis : public AnalysisInfoMixin<DXILResourceAnalysis> {
- friend AnalysisInfoMixin<DXILResourceAnalysis>;
+class DXILResourceMDAnalysis
+ : public AnalysisInfoMixin<DXILResourceMDAnalysis> {
+ friend AnalysisInfoMixin<DXILResourceMDAnalysis>;
static AnalysisKey Key;
public:
@@ -29,25 +30,26 @@ public:
dxil::Resources run(Module &M, ModuleAnalysisManager &AM);
};
-/// Printer pass for the \c DXILResourceAnalysis results.
-class DXILResourcePrinterPass : public PassInfoMixin<DXILResourcePrinterPass> {
+/// Printer pass for the \c DXILResourceMDAnalysis results.
+class DXILResourceMDPrinterPass
+ : public PassInfoMixin<DXILResourceMDPrinterPass> {
raw_ostream &OS;
public:
- explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {}
+ explicit DXILResourceMDPrinterPass(raw_ostream &OS) : OS(OS) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
static bool isRequired() { return true; }
};
/// The legacy pass manager's analysis pass to compute DXIL resource
/// information.
-class DXILResourceWrapper : public ModulePass {
+class DXILResourceMDWrapper : public ModulePass {
dxil::Resources Resources;
public:
static char ID; // Pass identification, replacement for typeid
- DXILResourceWrapper();
+ DXILResourceMDWrapper();
dxil::Resources &getDXILResource() { return Resources; }
const dxil::Resources &getDXILResource() const { return Resources; }
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index ae6d6f9..583bce0 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -33,7 +33,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
- AU.addRequired<DXILResourceWrapper>();
+ AU.addRequired<DXILResourceMDWrapper>();
AU.addRequired<ShaderFlagsAnalysisWrapper>();
}
@@ -51,7 +51,7 @@ bool DXILTranslateMetadata::runOnModule(Module &M) {
dxil::createDXILVersionMD(M);
const dxil::Resources &Res =
- getAnalysis<DXILResourceWrapper>().getDXILResource();
+ getAnalysis<DXILResourceMDWrapper>().getDXILResource();
Res.write(M);
const uint64_t Flags = static_cast<uint64_t>(
@@ -69,7 +69,7 @@ ModulePass *llvm::createDXILTranslateMetadataPass() {
INITIALIZE_PASS_BEGIN(DXILTranslateMetadata, "dxil-metadata-emit",
"DXIL Metadata Emit", false, false)
-INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapper)
+INITIALIZE_PASS_DEPENDENCY(DXILResourceMDWrapper)
INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper)
INITIALIZE_PASS_END(DXILTranslateMetadata, "dxil-metadata-emit",
"DXIL Metadata Emit", false, false)
diff --git a/llvm/lib/Target/DirectX/DirectX.h b/llvm/lib/Target/DirectX/DirectX.h
index 11b5412..d056ae2 100644
--- a/llvm/lib/Target/DirectX/DirectX.h
+++ b/llvm/lib/Target/DirectX/DirectX.h
@@ -47,7 +47,7 @@ void initializeDXILTranslateMetadataPass(PassRegistry &);
ModulePass *createDXILTranslateMetadataPass();
/// Initializer for DXILTranslateMetadata.
-void initializeDXILResourceWrapperPass(PassRegistry &);
+void initializeDXILResourceMDWrapperPass(PassRegistry &);
/// Pass to pretty print DXIL metadata.
ModulePass *createDXILPrettyPrinterPass(raw_ostream &OS);
diff --git a/llvm/lib/Target/DirectX/DirectXPassRegistry.def b/llvm/lib/Target/DirectX/DirectXPassRegistry.def
index 1b326d02..7544172 100644
--- a/llvm/lib/Target/DirectX/DirectXPassRegistry.def
+++ b/llvm/lib/Target/DirectX/DirectXPassRegistry.def
@@ -17,7 +17,7 @@
#define MODULE_ANALYSIS(NAME, CREATE_PASS)
#endif
MODULE_ANALYSIS("dx-shader-flags", dxil::ShaderFlagsAnalysis())
-MODULE_ANALYSIS("dxil-resource", DXILResourceAnalysis())
+MODULE_ANALYSIS("dxil-resource-md", DXILResourceMDAnalysis())
#undef MODULE_ANALYSIS
#ifndef MODULE_PASS
@@ -25,5 +25,5 @@ MODULE_ANALYSIS("dxil-resource", DXILResourceAnalysis())
#endif
// TODO: rename to print<foo> after NPM switch
MODULE_PASS("print-dx-shader-flags", dxil::ShaderFlagsAnalysisPrinter(dbgs()))
-MODULE_PASS("print-dxil-resource", DXILResourcePrinterPass(dbgs()))
+MODULE_PASS("print-dxil-resource-md", DXILResourceMDPrinterPass(dbgs()))
#undef MODULE_PASS
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index e6dbb25..92bd69b 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -46,7 +46,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
initializeDXContainerGlobalsPass(*PR);
initializeDXILOpLoweringLegacyPass(*PR);
initializeDXILTranslateMetadataPass(*PR);
- initializeDXILResourceWrapperPass(*PR);
+ initializeDXILResourceMDWrapperPass(*PR);
initializeShaderFlagsAnalysisWrapperPass(*PR);
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 1570493..6acc37e 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -702,7 +702,7 @@ public:
return true;
}
- bool finishLayout(const MCAssembler &Asm) const override {
+ void finishLayout(MCAssembler const &Asm) const override {
SmallVector<MCFragment *> Frags;
for (MCSection &Sec : Asm) {
Frags.clear();
@@ -747,6 +747,7 @@ public:
//assert(!Error);
(void)Error;
ReplaceInstruction(Asm.getEmitter(), RF, Inst);
+ Sec.setHasLayout(false);
Size = 0; // Only look back one instruction
break;
}
@@ -756,7 +757,6 @@ public:
}
}
}
- return true;
}
}; // class HexagonAsmBackend
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 7b21d30..aa86b2d 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -26,10 +26,10 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 27979a8..f478870 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -34,8 +34,10 @@ void LoongArchAsmPrinter::emitInstruction(const MachineInstr *MI) {
MI->getOpcode(), getSubtargetInfo().getFeatureBits());
// Do any auto-generated pseudo lowerings.
- if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+ EmitToStreamer(*OutStreamer, OutInst);
return;
+ }
switch (MI->getOpcode()) {
case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
index 6934564..9da9088 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h
@@ -47,8 +47,8 @@ public:
void emitSled(const MachineInstr &MI, SledKind Kind);
// tblgen'erated function.
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
+ bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
+
// Wrapper needed for tblgenned pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
return lowerLoongArchMachineOperandToMCOperand(MO, MCOp, *this);
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index c136f5b..33b93e42 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -62,27 +62,47 @@ private:
MachineBasicBlock::iterator &NextMBBI,
unsigned FlagsHi, unsigned SecondOpcode,
unsigned FlagsLo);
+ bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LastOpcode, unsigned IdentifyingMO);
+ bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned LastOpcode, unsigned IdentifyingMO,
+ const MachineOperand &Symbol, Register DestReg,
+ bool EraseFromParent);
bool expandLoadAddressPcrel(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
+ MachineBasicBlock::iterator &NextMBBI,
+ bool Large = false);
bool expandLoadAddressGot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
+ MachineBasicBlock::iterator &NextMBBI,
+ bool Large = false);
bool expandLoadAddressTLSLE(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
bool expandLoadAddressTLSIE(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
+ MachineBasicBlock::iterator &NextMBBI,
+ bool Large = false);
bool expandLoadAddressTLSLD(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
+ MachineBasicBlock::iterator &NextMBBI,
+ bool Large = false);
bool expandLoadAddressTLSGD(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
+ MachineBasicBlock::iterator &NextMBBI,
+ bool Large = false);
bool expandLoadAddressTLSDesc(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
+ MachineBasicBlock::iterator &NextMBBI,
+ bool Large = false);
+ bool expandFunctionCALL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ bool IsTailCall);
};
char LoongArchPreRAExpandPseudo::ID = 0;
@@ -115,18 +135,38 @@ bool LoongArchPreRAExpandPseudo::expandMI(
switch (MBBI->getOpcode()) {
case LoongArch::PseudoLA_PCREL:
return expandLoadAddressPcrel(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_PCREL_LARGE:
+ return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_GOT:
return expandLoadAddressGot(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_GOT_LARGE:
+ return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_TLS_LE:
return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI);
case LoongArch::PseudoLA_TLS_IE:
return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_TLS_IE_LARGE:
+ return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_TLS_LD:
return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_TLS_LD_LARGE:
+ return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_TLS_GD:
return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_TLS_GD_LARGE:
+ return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true);
case LoongArch::PseudoLA_TLS_DESC_PC:
return expandLoadAddressTLSDesc(MBB, MBBI, NextMBBI);
+ case LoongArch::PseudoLA_TLS_DESC_PC_LARGE:
+ return expandLoadAddressTLSDesc(MBB, MBBI, NextMBBI, /*Large=*/true);
+ case LoongArch::PseudoCALL:
+ case LoongArch::PseudoCALL_MEDIUM:
+ case LoongArch::PseudoCALL_LARGE:
+ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
+ case LoongArch::PseudoTAIL:
+ case LoongArch::PseudoTAIL_MEDIUM:
+ case LoongArch::PseudoTAIL_LARGE:
+ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
}
return false;
}
@@ -159,9 +199,118 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair(
return true;
}
+bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+ unsigned IdentifyingMO) {
+ MachineInstr &MI = *MBBI;
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
+ MI.getOperand(2), MI.getOperand(0).getReg(),
+ true);
+}
+
+bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+ unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
+ bool EraseFromParent) {
+ // Code Sequence:
+ //
+ // Part1: pcalau12i $scratch, %MO1(sym)
+ // Part0: addi.d $dest, $zero, %MO0(sym)
+ // Part2: lu32i.d $dest, %MO2(sym)
+ // Part3: lu52i.d $dest, $dest, %MO3(sym)
+ // Fin: LastOpcode $dest, $dest, $scratch
+
+ unsigned MO0, MO1, MO2, MO3;
+ switch (IdentifyingMO) {
+ default:
+ llvm_unreachable("unsupported identifying MO");
+ case LoongArchII::MO_PCREL_LO:
+ MO0 = IdentifyingMO;
+ MO1 = LoongArchII::MO_PCREL_HI;
+ MO2 = LoongArchII::MO_PCREL64_LO;
+ MO3 = LoongArchII::MO_PCREL64_HI;
+ break;
+ case LoongArchII::MO_GOT_PC_HI:
+ case LoongArchII::MO_LD_PC_HI:
+ case LoongArchII::MO_GD_PC_HI:
+ // These cases relocate just like the GOT case, except for Part1.
+ MO0 = LoongArchII::MO_GOT_PC_LO;
+ MO1 = IdentifyingMO;
+ MO2 = LoongArchII::MO_GOT_PC64_LO;
+ MO3 = LoongArchII::MO_GOT_PC64_HI;
+ break;
+ case LoongArchII::MO_IE_PC_LO:
+ MO0 = IdentifyingMO;
+ MO1 = LoongArchII::MO_IE_PC_HI;
+ MO2 = LoongArchII::MO_IE_PC64_LO;
+ MO3 = LoongArchII::MO_IE_PC64_HI;
+ break;
+ }
+
+ MachineFunction *MF = MBB.getParent();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+
+ assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+ "Large code model requires LA64");
+
+ Register TmpPart1 =
+ MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+ Register TmpPart0 =
+ DestReg.isVirtual()
+ ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+ : DestReg;
+ Register TmpParts02 =
+ DestReg.isVirtual()
+ ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+ : DestReg;
+ Register TmpParts023 =
+ DestReg.isVirtual()
+ ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+ : DestReg;
+
+ auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1);
+ auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0)
+ .addReg(LoongArch::R0);
+ auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02)
+ // "rj" is needed due to InstrInfo pattern requirement.
+ .addReg(TmpPart0, RegState::Kill);
+ auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023)
+ .addReg(TmpParts02, RegState::Kill);
+ BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
+ .addReg(TmpParts023)
+ .addReg(TmpPart1, RegState::Kill);
+
+ if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
+ const char *SymName = Symbol.getSymbolName();
+ Part0.addExternalSymbol(SymName, MO0);
+ Part1.addExternalSymbol(SymName, MO1);
+ Part2.addExternalSymbol(SymName, MO2);
+ Part3.addExternalSymbol(SymName, MO3);
+ } else {
+ Part0.addDisp(Symbol, 0, MO0);
+ Part1.addDisp(Symbol, 0, MO1);
+ Part2.addDisp(Symbol, 0, MO2);
+ Part3.addDisp(Symbol, 0, MO3);
+ }
+
+ if (EraseFromParent)
+ MI.eraseFromParent();
+
+ return true;
+}
+
bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
+ MachineBasicBlock::iterator &NextMBBI, bool Large) {
+ if (Large)
+ // Emit the 5-insn large address load sequence with the `%pc` family of
+ // relocs.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+ LoongArchII::MO_PCREL_LO);
+
// Code Sequence:
// pcalau12i $rd, %pc_hi20(sym)
// addi.w/d $rd, $rd, %pc_lo12(sym)
@@ -174,7 +323,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
bool LoongArchPreRAExpandPseudo::expandLoadAddressGot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
+ MachineBasicBlock::iterator &NextMBBI, bool Large) {
+ if (Large)
+ // Emit the 5-insn large address load sequence with the `%got_pc` family
+ // of relocs, loading the result from GOT with `ldx.d` in the end.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+ LoongArchII::MO_GOT_PC_HI);
+
// Code Sequence:
// pcalau12i $rd, %got_pc_hi20(sym)
// ld.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -235,7 +390,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
+ MachineBasicBlock::iterator &NextMBBI, bool Large) {
+ if (Large)
+ // Emit the 5-insn large address load sequence with the `%ie_pc` family
+ // of relocs, loading the result with `ldx.d` in the end.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+ LoongArchII::MO_IE_PC_LO);
+
// Code Sequence:
// pcalau12i $rd, %ie_pc_hi20(sym)
// ld.w/d $rd, $rd, %ie_pc_lo12(sym)
@@ -248,7 +409,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
+ MachineBasicBlock::iterator &NextMBBI, bool Large) {
+ if (Large)
+ // Emit the 5-insn large address load sequence with the `%got_pc` family
+ // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+ LoongArchII::MO_LD_PC_HI);
+
// Code Sequence:
// pcalau12i $rd, %ld_pc_hi20(sym)
// addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -261,7 +428,13 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
+ MachineBasicBlock::iterator &NextMBBI, bool Large) {
+ if (Large)
+ // Emit the 5-insn large address load sequence with the `%got_pc` family
+ // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+ LoongArchII::MO_GD_PC_HI);
+
// Code Sequence:
// pcalau12i $rd, %gd_pc_hi20(sym)
// addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -274,13 +447,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- // Code Sequence:
- // pcalau12i $a0, %desc_pc_hi20(sym)
- // addi.w/d $a0, $a0, %desc_pc_lo12(sym)
- // ld.w/d $ra, $a0, %desc_ld(sym)
- // jirl $ra, $ra, %desc_ld(sym)
- // add.d $dst, $a0, $tp
+ MachineBasicBlock::iterator &NextMBBI, bool Large) {
MachineFunction *MF = MBB.getParent();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
@@ -291,25 +458,62 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc(
unsigned LD = STI.is64Bit() ? LoongArch::LD_D : LoongArch::LD_W;
Register DestReg = MI.getOperand(0).getReg();
- Register ScratchReg =
+ Register Tmp1Reg =
MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
- MachineOperand &Symbol = MI.getOperand(1);
+ MachineOperand &Symbol = MI.getOperand(Large ? 2 : 1);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg)
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), Tmp1Reg)
.addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_HI);
- BuildMI(MBB, MBBI, DL, TII->get(ADDI), LoongArch::R4)
- .addReg(ScratchReg)
- .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+ if (Large) {
+ // Code Sequence:
+ //
+ // pcalau12i $a0, %desc_pc_hi20(sym)
+ // addi.d $a1, $zero, %desc_pc_lo12(sym)
+ // lu32i.d $a1, %desc64_pc_lo20(sym)
+ // lu52i.d $a1, $a1, %desc64_pc_hi12(sym)
+ // add.d $a0, $a0, $a1
+ // ld.d $ra, $a0, %desc_ld(sym)
+ // jirl $ra, $ra, %desc_call(sym)
+ // add.d $dst, $a0, $tp
+ assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+ "Large code model requires LA64");
+ Register Tmp2Reg =
+ MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+ Register Tmp3Reg =
+ MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+ Register Tmp4Reg =
+ MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), Tmp2Reg)
+ .addReg(LoongArch::R0)
+ .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), Tmp3Reg)
+ .addReg(Tmp2Reg, RegState::Kill)
+ .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_LO);
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), Tmp4Reg)
+ .addReg(Tmp3Reg)
+ .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_HI);
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), LoongArch::R4)
+ .addReg(Tmp1Reg)
+ .addReg(Tmp4Reg);
+ } else {
+ // Code Sequence:
+ // pcalau12i $a0, %desc_pc_hi20(sym)
+ // addi.w/d $a0, $a0, %desc_pc_lo12(sym)
+ // ld.w/d $ra, $a0, %desc_ld(sym)
+ // jirl $ra, $ra, %desc_ld(sym)
+ // add.d $dst, $a0, $tp
+ BuildMI(MBB, MBBI, DL, TII->get(ADDI), LoongArch::R4)
+ .addReg(Tmp1Reg)
+ .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
+ }
BuildMI(MBB, MBBI, DL, TII->get(LD), LoongArch::R1)
.addReg(LoongArch::R4)
.addDisp(Symbol, 0, LoongArchII::MO_DESC_LD);
-
BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoDESC_CALL), LoongArch::R1)
.addReg(LoongArch::R1)
.addDisp(Symbol, 0, LoongArchII::MO_DESC_CALL);
-
BuildMI(MBB, MBBI, DL, TII->get(ADD), DestReg)
.addReg(LoongArch::R4)
.addReg(LoongArch::R2);
@@ -318,6 +522,85 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSDesc(
return true;
}
+bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
+ MachineFunction *MF = MBB.getParent();
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ const MachineOperand &Func = MI.getOperand(0);
+ MachineInstrBuilder CALL;
+ unsigned Opcode;
+
+ switch (MF->getTarget().getCodeModel()) {
+ default:
+ report_fatal_error("Unsupported code model");
+ break;
+ case CodeModel::Small: {
+ // CALL:
+ // bl func
+ // TAIL:
+ // b func
+ Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
+ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
+ break;
+ }
+ case CodeModel::Medium: {
+ // CALL:
+ // pcaddu18i $ra, %call36(func)
+ // jirl $ra, $ra, 0
+ // TAIL:
+ // pcaddu18i $scratch, %call36(func)
+ // jirl $r0, $scratch, 0
+ Opcode =
+ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+ Register ScratchReg =
+ IsTailCall
+ ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+ : LoongArch::R1;
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
+
+ CALL =
+ BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
+
+ if (Func.isSymbol())
+ MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
+ else
+ MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
+ break;
+ }
+ case CodeModel::Large: {
+ // Emit the 5-insn large address load sequence, either directly or
+ // indirectly in case of going through the GOT, then JIRL_TAIL or
+ // JIRL_CALL to $addr.
+ Opcode =
+ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+ Register AddrReg =
+ IsTailCall
+ ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
+ : LoongArch::R1;
+
+ bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
+ unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
+ unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
+ expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
+ false);
+ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
+ break;
+ }
+ }
+
+ // Transfer implicit operands.
+ CALL.copyImplicitOps(MI);
+
+ // Transfer MI flags.
+ CALL.setMIFlags(MI.getFlags());
+
+ MI.eraseFromParent();
+ return true;
+}
+
class LoongArchExpandPseudo : public MachineFunctionPass {
public:
const LoongArchInstrInfo *TII;
@@ -339,38 +622,6 @@ private:
MachineBasicBlock::iterator &NextMBBI);
bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
- bool expandLargeAddressLoad(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- unsigned LastOpcode, unsigned IdentifyingMO);
- bool expandLargeAddressLoad(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- unsigned LastOpcode, unsigned IdentifyingMO,
- const MachineOperand &Symbol, Register DestReg,
- bool EraseFromParent);
- bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandLoadAddressGotLarge(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandLoadAddressTLSDescPcLarge(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandFunctionCALL(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI,
- bool IsTailCall);
};
char LoongArchExpandPseudo::ID = 0;
@@ -405,26 +656,6 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,
switch (MBBI->getOpcode()) {
case LoongArch::PseudoCopyCFR:
return expandCopyCFR(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_PCREL_LARGE:
- return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_GOT_LARGE:
- return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_TLS_IE_LARGE:
- return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_TLS_LD_LARGE:
- return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_TLS_GD_LARGE:
- return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoLA_TLS_DESC_PC_LARGE:
- return expandLoadAddressTLSDescPcLarge(MBB, MBBI, NextMBBI);
- case LoongArch::PseudoCALL:
- case LoongArch::PseudoCALL_MEDIUM:
- case LoongArch::PseudoCALL_LARGE:
- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
- case LoongArch::PseudoTAIL:
- case LoongArch::PseudoTAIL_MEDIUM:
- case LoongArch::PseudoTAIL_LARGE:
- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
}
return false;
@@ -483,264 +714,6 @@ bool LoongArchExpandPseudo::expandCopyCFR(
return true;
}
-bool LoongArchExpandPseudo::expandLargeAddressLoad(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
- unsigned IdentifyingMO) {
- MachineInstr &MI = *MBBI;
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
- MI.getOperand(2), MI.getOperand(0).getReg(),
- true);
-}
-
-bool LoongArchExpandPseudo::expandLargeAddressLoad(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
- unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
- bool EraseFromParent) {
- // Code Sequence:
- //
- // Part1: pcalau12i $dst, %MO1(sym)
- // Part0: addi.d $t8, $zero, %MO0(sym)
- // Part2: lu32i.d $t8, %MO2(sym)
- // Part3: lu52i.d $t8, $t8, %MO3(sym)
- // Fin: LastOpcode $dst, $t8, $dst
-
- unsigned MO0, MO1, MO2, MO3;
- switch (IdentifyingMO) {
- default:
- llvm_unreachable("unsupported identifying MO");
- case LoongArchII::MO_PCREL_LO:
- MO0 = IdentifyingMO;
- MO1 = LoongArchII::MO_PCREL_HI;
- MO2 = LoongArchII::MO_PCREL64_LO;
- MO3 = LoongArchII::MO_PCREL64_HI;
- break;
- case LoongArchII::MO_GOT_PC_HI:
- case LoongArchII::MO_LD_PC_HI:
- case LoongArchII::MO_GD_PC_HI:
- // These cases relocate just like the GOT case, except for Part1.
- MO0 = LoongArchII::MO_GOT_PC_LO;
- MO1 = IdentifyingMO;
- MO2 = LoongArchII::MO_GOT_PC64_LO;
- MO3 = LoongArchII::MO_GOT_PC64_HI;
- break;
- case LoongArchII::MO_IE_PC_LO:
- MO0 = IdentifyingMO;
- MO1 = LoongArchII::MO_IE_PC_HI;
- MO2 = LoongArchII::MO_IE_PC64_LO;
- MO3 = LoongArchII::MO_IE_PC64_HI;
- break;
- }
-
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- Register ScratchReg = LoongArch::R20; // $t8
-
- assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
- "Large code model requires LA64");
-
- auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
- auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
- .addReg(LoongArch::R0);
- auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
- // "rj" is needed due to InstrInfo pattern requirement.
- .addReg(ScratchReg);
- auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
- .addReg(ScratchReg);
- BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
- .addReg(ScratchReg)
- .addReg(DestReg);
-
- if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
- const char *SymName = Symbol.getSymbolName();
- Part0.addExternalSymbol(SymName, MO0);
- Part1.addExternalSymbol(SymName, MO1);
- Part2.addExternalSymbol(SymName, MO2);
- Part3.addExternalSymbol(SymName, MO3);
- } else {
- Part0.addDisp(Symbol, 0, MO0);
- Part1.addDisp(Symbol, 0, MO1);
- Part2.addDisp(Symbol, 0, MO2);
- Part3.addDisp(Symbol, 0, MO3);
- }
-
- if (EraseFromParent)
- MI.eraseFromParent();
-
- return true;
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- // Emit the 5-insn large address load sequence with the `%pc` family of
- // relocs.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
- LoongArchII::MO_PCREL_LO);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressGotLarge(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- // Emit the 5-insn large address load sequence with the `%got_pc` family
- // of relocs, loading the result from GOT with `ldx.d` in the end.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
- LoongArchII::MO_GOT_PC_HI);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- // Emit the 5-insn large address load sequence with the `%ie_pc` family
- // of relocs, loading the result with `ldx.d` in the end.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
- LoongArchII::MO_IE_PC_LO);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- // Emit the 5-insn large address load sequence with the `%got_pc` family
- // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
- LoongArchII::MO_LD_PC_HI);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- // Emit the 5-insn large address load sequence with the `%got_pc` family
- // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
- LoongArchII::MO_GD_PC_HI);
-}
-
-bool LoongArchExpandPseudo::expandLoadAddressTLSDescPcLarge(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI) {
- // Code Sequence:
- //
- // pcalau12i $a0, %desc_pc_hi20(sym)
- // addi.d $t8, $zero, %desc_pc_lo12(sym)
- // lu32i.d $t8, %desc64_pc_lo20(sym)
- // lu52i.d $t8, $t8, %desc64_pc_hi12(sym)
- // add.d $a0, $a0, $t8
- // ld.d $ra, $a0, %desc_ld(sym)
- // jirl $ra, $ra, %desc_call(sym)
- // add.d $dst, $a0, $tp
-
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- Register DestReg = MI.getOperand(0).getReg();
- MachineOperand &Symbol = MI.getOperand(2);
- Register ScratchReg = LoongArch::R20; // $t8
-
- assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
- "Large code model requires LA64");
-
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), LoongArch::R4)
- .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_HI);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
- .addReg(LoongArch::R0)
- .addDisp(Symbol, 0, LoongArchII::MO_DESC_PC_LO);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
- .addReg(ScratchReg)
- .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_LO);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
- .addReg(ScratchReg)
- .addDisp(Symbol, 0, LoongArchII::MO_DESC64_PC_HI);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), LoongArch::R4)
- .addReg(ScratchReg)
- .addReg(LoongArch::R4);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LD_D), LoongArch::R1)
- .addReg(LoongArch::R4)
- .addDisp(Symbol, 0, LoongArchII::MO_DESC_LD);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PseudoDESC_CALL), LoongArch::R1)
- .addReg(LoongArch::R1)
- .addDisp(Symbol, 0, LoongArchII::MO_DESC_CALL);
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADD_D), DestReg)
- .addReg(LoongArch::R4)
- .addReg(LoongArch::R2);
-
- MI.eraseFromParent();
-
- return true;
-}
-
-bool LoongArchExpandPseudo::expandFunctionCALL(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
- MachineFunction *MF = MBB.getParent();
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- const MachineOperand &Func = MI.getOperand(0);
- MachineInstrBuilder CALL;
- unsigned Opcode;
-
- switch (MF->getTarget().getCodeModel()) {
- default:
- report_fatal_error("Unsupported code model");
- break;
- case CodeModel::Small: {
- // CALL:
- // bl func
- // TAIL:
- // b func
- Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
- break;
- }
- case CodeModel::Medium: {
- // CALL:
- // pcaddu18i $ra, %call36(func)
- // jirl $ra, $ra, 0
- // TAIL:
- // pcaddu18i $t8, %call36(func)
- // jr $t8
- Opcode =
- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
- Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1;
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
-
- CALL =
- BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
-
- if (Func.isSymbol())
- MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
- else
- MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
- break;
- }
- case CodeModel::Large: {
- // Emit the 5-insn large address load sequence, either directly or
- // indirectly in case of going through the GOT, then JIRL_TAIL or
- // JIRL_CALL to $addr.
- Opcode =
- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
- Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1;
-
- bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
- unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
- unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
- expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
- false);
- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
- break;
- }
- }
-
- // Transfer implicit operands.
- CALL.copyImplicitOps(MI);
-
- // Transfer MI flags.
- CALL.setMIFlags(MI.getFlags());
-
- MI.eraseFromParent();
- return true;
-}
-
} // end namespace
INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo",
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
index a85b054..90d94e9 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp
@@ -347,6 +347,83 @@ bool LoongArchInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
}
}
+bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
+ return true;
+
+ auto MII = MI.getIterator();
+ auto MIE = MBB->end();
+
+ // According to psABI v2.30:
+ //
+ // https://github.com/loongson/la-abi-specs/releases/tag/v2.30
+ //
+ // The following instruction patterns are prohibited from being reordered:
+ //
+  // * pcaddu18i $ra, %call36(s)
+ // jirl $ra, $ra, 0
+ //
+ // * pcalau12i $a0, %pc_hi20(s)
+ // addi.d $a1, $zero, %pc_lo12(s)
+ // lu32i.d $a1, %pc64_lo20(s)
+ // lu52i.d $a1, $a1, %pc64_hi12(s)
+ //
+ // * pcalau12i $a0, %got_pc_hi20(s) | %ld_pc_hi20(s) | %gd_pc_hi20(s)
+ // addi.d $a1, $zero, %got_pc_lo12(s)
+ // lu32i.d $a1, %got64_pc_lo20(s)
+ // lu52i.d $a1, $a1, %got64_pc_hi12(s)
+ //
+ // * pcalau12i $a0, %ie_pc_hi20(s)
+ // addi.d $a1, $zero, %ie_pc_lo12(s)
+ // lu32i.d $a1, %ie64_pc_lo20(s)
+ // lu52i.d $a1, $a1, %ie64_pc_hi12(s)
+ //
+ // For simplicity, only pcalau12i and lu52i.d are marked as scheduling
+ // boundaries, and the instructions between them are guaranteed to be
+ // ordered according to data dependencies.
+ switch (MI.getOpcode()) {
+ case LoongArch::PCADDU18I:
+ if (MI.getOperand(1).getTargetFlags() == LoongArchII::MO_CALL36)
+ return true;
+ break;
+ case LoongArch::PCALAU12I: {
+ auto AddI = std::next(MII);
+ if (AddI == MIE || AddI->getOpcode() != LoongArch::ADDI_D)
+ break;
+ auto Lu32I = std::next(AddI);
+ if (Lu32I == MIE || Lu32I->getOpcode() != LoongArch::LU32I_D)
+ break;
+ auto MO0 = MI.getOperand(1).getTargetFlags();
+ auto MO1 = AddI->getOperand(2).getTargetFlags();
+ auto MO2 = Lu32I->getOperand(2).getTargetFlags();
+ if (MO0 == LoongArchII::MO_PCREL_HI && MO1 == LoongArchII::MO_PCREL_LO &&
+ MO2 == LoongArchII::MO_PCREL64_LO)
+ return true;
+ if ((MO0 == LoongArchII::MO_GOT_PC_HI || MO0 == LoongArchII::MO_LD_PC_HI ||
+ MO0 == LoongArchII::MO_GD_PC_HI) &&
+ MO1 == LoongArchII::MO_GOT_PC_LO && MO2 == LoongArchII::MO_GOT_PC64_LO)
+ return true;
+ if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO &&
+ MO2 == LoongArchII::MO_IE_PC64_LO)
+ return true;
+ break;
+ }
+ case LoongArch::LU52I_D: {
+ auto MO = MI.getOperand(2).getTargetFlags();
+ if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI ||
+ MO == LoongArchII::MO_IE_PC64_HI)
+ return true;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
+
unsigned LoongArchInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
if (BytesRemoved)
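A minimal standalone sketch of the boundary check added above, covering only the PC-relative variant: the first (pcalau12i) and last (lu52i.d) instructions of a large-address sequence, plus pcaddu18i with a %call36 operand, are treated as points nothing may be scheduled across. ToyInst, ToyOpcode and ToyFlag are invented names for illustration and are not LLVM types; this sketches the technique, not the patch's implementation.

// Illustrative sketch only -- ToyInst, ToyOpcode and ToyFlag are invented for
// this note and are not LLVM APIs.
#include <cstdio>
#include <vector>

enum class ToyOpcode { PCADDU18I, PCALAU12I, ADDI_D, LU32I_D, LU52I_D, OTHER };
enum class ToyFlag { CALL36, PCREL_HI, PCREL_LO, PCREL64_LO, PCREL64_HI, NONE };

struct ToyInst {
  ToyOpcode Opc;
  ToyFlag Flag; // relocation flag carried by the symbolic operand
};

static bool isBoundary(const std::vector<ToyInst> &Block, size_t I) {
  const ToyInst &MI = Block[I];
  switch (MI.Opc) {
  case ToyOpcode::PCADDU18I:
    // pcaddu18i $ra, %call36(s) must stay adjacent to the following jirl.
    return MI.Flag == ToyFlag::CALL36;
  case ToyOpcode::PCALAU12I: {
    // Opening instruction of the 4-insn PC-relative large-address sequence.
    if (I + 2 >= Block.size())
      return false;
    const ToyInst &AddI = Block[I + 1];
    const ToyInst &Lu32I = Block[I + 2];
    return AddI.Opc == ToyOpcode::ADDI_D && Lu32I.Opc == ToyOpcode::LU32I_D &&
           MI.Flag == ToyFlag::PCREL_HI && AddI.Flag == ToyFlag::PCREL_LO &&
           Lu32I.Flag == ToyFlag::PCREL64_LO;
  }
  case ToyOpcode::LU52I_D:
    // Closing instruction of the sequence.
    return MI.Flag == ToyFlag::PCREL64_HI;
  default:
    return false;
  }
}

int main() {
  std::vector<ToyInst> Block = {{ToyOpcode::PCALAU12I, ToyFlag::PCREL_HI},
                                {ToyOpcode::ADDI_D, ToyFlag::PCREL_LO},
                                {ToyOpcode::LU32I_D, ToyFlag::PCREL64_LO},
                                {ToyOpcode::LU52I_D, ToyFlag::PCREL64_HI},
                                {ToyOpcode::OTHER, ToyFlag::NONE}};
  for (size_t I = 0; I < Block.size(); ++I)
    std::printf("inst %zu: boundary = %d\n", I, isBoundary(Block, I) ? 1 : 0);
  return 0;
}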
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
index eb19051..d66b2cb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h
@@ -64,6 +64,10 @@ public:
bool isBranchOffsetInRange(unsigned BranchOpc,
int64_t BrOffset) const override;
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index ef647a4..0ab4298 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -1123,7 +1123,7 @@ def : PatGprGpr<urem, MOD_WU>;
def : PatGprGpr<mul, MUL_W>;
def : PatGprGpr<mulhs, MULH_W>;
def : PatGprGpr<mulhu, MULH_WU>;
-def : PatGprGpr<rotr, ROTR_W>;
+def : PatGprGpr<shiftop<rotr>, ROTR_W>;
def : PatGprImm<rotr, ROTRI_W, uimm5>;
foreach Idx = 1...3 in {
@@ -1146,8 +1146,8 @@ def : PatGprGpr<srem, MOD_D>;
def : PatGprGpr_32<srem, MOD_W>;
def : PatGprGpr<urem, MOD_DU>;
def : PatGprGpr<loongarch_mod_wu, MOD_WU>;
-def : PatGprGpr<rotr, ROTR_D>;
-def : PatGprGpr<loongarch_rotr_w, ROTR_W>;
+def : PatGprGpr<shiftop<rotr>, ROTR_D>;
+def : PatGprGpr<shiftopw<loongarch_rotr_w>, ROTR_W>;
def : PatGprImm<rotr, ROTRI_D, uimm6>;
def : PatGprImm_32<rotr, ROTRI_W, uimm5>;
def : PatGprImm<loongarch_rotr_w, ROTRI_W, uimm5>;
@@ -1481,7 +1481,7 @@ def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
// Function call with 'Medium' code model.
-let isCall = 1, Defs = [R1, R20], Size = 8 in
+let isCall = 1, Defs = [R1] in
def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>;
let Predicates = [IsLA64] in {
@@ -1492,7 +1492,7 @@ def : Pat<(loongarch_call_medium texternalsym:$func),
} // Predicates = [IsLA64]
// Function call with 'Large' code model.
-let isCall = 1, Defs = [R1, R20], Size = 24 in
+let isCall = 1, Defs = [R1] in
def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>;
let Predicates = [IsLA64] in {
@@ -1530,8 +1530,7 @@ def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),
(PseudoTAIL texternalsym:$dst)>;
// Tail call with 'Medium' code model.
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- Uses = [R3], Defs = [R20], Size = 8 in
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>;
let Predicates = [IsLA64] in {
@@ -1542,8 +1541,7 @@ def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)),
} // Predicates = [IsLA64]
// Tail call with 'Large' code model.
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- Uses = [R3], Defs = [R19, R20], Size = 24 in
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>;
let Predicates = [IsLA64] in {
@@ -1575,12 +1573,12 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
/// call36/taill36 macro instructions
let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1,
- Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
+ Defs = [R1], hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [],
"call36", "$dst">,
Requires<[IsLA64]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3],
- isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0,
+ isCodeGenOnly = 0, isAsmParserOnly = 1, hasSideEffects = 0,
mayStore = 0, mayLoad = 0 in
def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [],
"tail36", "$tmp, $dst">,
@@ -1617,7 +1615,6 @@ def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.ld", "$dst, $src">;
def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.gd", "$dst, $src">;
-let Defs = [R20], Size = 20 in {
def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.pcrel", "$dst, $tmp, $src">,
@@ -1632,7 +1629,6 @@ def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.tls.gd", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
-} // Defs = [R20], Size = 20
}
let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
isAsmParserOnly = 1 in {
@@ -1640,7 +1636,6 @@ def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.got", "$dst, $src">;
def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
"la.tls.ie", "$dst, $src">;
-let Defs = [R20], Size = 20 in {
def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.got", "$dst, $tmp, $src">,
@@ -1649,7 +1644,6 @@ def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.tls.ie", "$dst, $tmp, $src">,
Requires<[IsLA64]>;
-} // Defs = [R20], Size = 20
}
// Used for expand PseudoLA_TLS_DESC_* instructions.
@@ -1674,7 +1668,7 @@ def PseudoLA_TLS_DESC_PC : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
}
let isCall = 1, isBarrier = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0,
- isCodeGenOnly = 0, isAsmParserOnly = 1, Defs = [R1, R4, R20], Size = 32 in
+ isCodeGenOnly = 0, isAsmParserOnly = 1, Defs = [R1, R4] in
def PseudoLA_TLS_DESC_PC_LARGE : Pseudo<(outs GPR:$dst),
(ins GPR:$tmp, bare_symbol:$src), [],
"la.tls.desc", "$dst, $tmp, $src">,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
index 2d781ac..eee8582 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFStreamer.cpp
@@ -16,7 +16,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCELFObjectWriter.h"
using namespace llvm;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 1650ffc..cca4a9cc 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -19,6 +19,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 018c620..e267a6d 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -254,8 +254,10 @@ void MipsAsmPrinter::emitInstruction(const MachineInstr *MI) {
do {
// Do any auto-generated pseudo lowerings.
- if (emitPseudoExpansionLowering(*OutStreamer, &*I))
+ if (MCInst OutInst; lowerPseudoInstExpansion(&*I, OutInst)) {
+ EmitToStreamer(*OutStreamer, OutInst);
continue;
+ }
// Skip the BUNDLE pseudo instruction and lower the contents
if (I->isBundle())
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h
index 0b55089..d53a0f6 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.h
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -69,8 +69,7 @@ private:
void EmitSled(const MachineInstr &MI, SledKind Kind);
// tblgen'erated function.
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
+ bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
// Emit PseudoReturn, PseudoReturn64, PseudoIndirectBranch,
// and PseudoIndirectBranch64 as a JR, JR_MM, JALR, or JALR64 as appropriate
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 10ae81e..9abe0e3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -93,5 +93,8 @@ MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr(
TargetFrameLowering::DwarfFrameBase
NVPTXFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
- return {DwarfFrameBase::CFA, {0}};
+ DwarfFrameBase FrameBase;
+ FrameBase.Kind = DwarfFrameBase::CFA;
+ FrameBase.Location.Offset = 0;
+ return FrameBase;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 91b239a..a5bdc6f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -268,16 +268,12 @@ multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
}
}
-// Template for instructions which take three fp64 or fp32 args. The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+// Template for minimum/maximum instructions.
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32 functions.
-//
-// This multiclass should be used for nodes that cannot be folded into FMAs.
-// For nodes that can be folded into FMAs (i.e. adds and muls), use
-// F3_fma_component.
-multiclass F3<string OpcStr, SDNode OpNode> {
+multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
+ if !not(NaN) then {
def f64rr :
NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
@@ -288,6 +284,7 @@ multiclass F3<string OpcStr, SDNode OpNode> {
(ins Float64Regs:$a, f64imm:$b),
!strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
[(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+ }
def f32rr_ftz :
NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
@@ -322,45 +319,45 @@ multiclass F3<string OpcStr, SDNode OpNode> {
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
[(set Int16Regs:$dst, (OpNode (f16 Int16Regs:$a), (f16 Int16Regs:$b)))]>,
- Requires<[useFP16Math]>;
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
def f16x2rr_ftz :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
[(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
- Requires<[useFP16Math, doF32FTZ]>;
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
def f16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
[(set Int32Regs:$dst, (OpNode (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>,
- Requires<[useFP16Math]>;
+ Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
def bf16rr_ftz :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".ftz.bf16 \t$dst, $a, $b;"),
[(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
- Requires<[hasBF16Math, doF32FTZ]>;
+ Requires<[hasBF16Math, doF32FTZ, hasSM<80>, hasPTX<70>]>;
def bf16rr :
NVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
!strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
[(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a), (bf16 Int16Regs:$b)))]>,
- Requires<[hasBF16Math]>;
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
def bf16x2rr_ftz :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".ftz.bf16x2 \t$dst, $a, $b;"),
[(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
- Requires<[hasBF16Math, doF32FTZ]>;
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
def bf16x2rr :
NVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
!strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
[(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b)))]>,
- Requires<[hasBF16Math]>;
+ Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}
// Template for instructions which take three FP args. The
@@ -1178,11 +1175,10 @@ defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;
-defm FMIN : F3<"min", fminnum>;
-defm FMAX : F3<"max", fmaxnum>;
-// Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
-defm FMINNAN : F3<"min.NaN", fminimum>;
-defm FMAXNAN : F3<"max.NaN", fmaximum>;
+defm FMIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
+defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
+defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
+defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 7bdca87..7e79d27 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -25,11 +25,11 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCStreamer.h"
diff --git a/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
index 00931b1..e43437d 100644
--- a/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
+++ b/llvm/lib/Target/PowerPC/PPCGenScalarMASSEntries.cpp
@@ -123,9 +123,7 @@ bool PPCGenScalarMASSEntries::runOnModule(Module &M) {
// The call to createScalarMASSCall() invalidates the iterator over users
// upon replacing the users. Precomputing the current list of users allows
// us to replace all the call sites.
- SmallVector<User *, 4> TheUsers;
- for (auto *User : Func.users())
- TheUsers.push_back(User);
+ SmallVector<User *, 4> TheUsers(Func.users());
for (auto *User : TheUsers)
if (auto *CI = dyn_cast_or_null<CallInst>(User)) {
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index fdbdc14..3cb7cd9 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -17,6 +17,7 @@ def sub_un : SubRegIndex<1, 3>;
def sub_32 : SubRegIndex<32>;
def sub_32_hi_phony : SubRegIndex<32,32>;
def sub_64 : SubRegIndex<64>;
+def sub_64_hi_phony : SubRegIndex<64,64>;
def sub_vsx0 : SubRegIndex<128>;
def sub_vsx1 : SubRegIndex<128, 128>;
def sub_gp8_x0 : SubRegIndex<64>;
@@ -77,19 +78,19 @@ class VF<bits<5> num, string n> : PPCReg<n> {
}
// VR - One of the 32 128-bit vector registers
-class VR<VF SubReg, string n> : PPCReg<n> {
+class VR<VF SubReg, VF SubRegH, string n> : PPCReg<n> {
let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
let HWEncoding{5} = 0;
- let SubRegs = [SubReg];
- let SubRegIndices = [sub_64];
+ let SubRegs = [SubReg, SubRegH];
+ let SubRegIndices = [sub_64, sub_64_hi_phony];
}
// VSRL - One of the 32 128-bit VSX registers that overlap with the scalar
// floating-point registers.
-class VSRL<FPR SubReg, string n> : PPCReg<n> {
+class VSRL<FPR SubReg, FPR SubRegH, string n> : PPCReg<n> {
let HWEncoding = SubReg.HWEncoding;
- let SubRegs = [SubReg];
- let SubRegIndices = [sub_64];
+ let SubRegs = [SubReg, SubRegH];
+ let SubRegIndices = [sub_64, sub_64_hi_phony];
}
// VSXReg - One of the VSX registers in the range vs32-vs63 with numbering
@@ -155,6 +156,22 @@ foreach Index = 0-31 in {
DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
}
+// The FH and VFH registers have been marked as Artificial because there are no
+// instructions on PowerPC that use those register classes. They only exist
+// in order to ensure that the super registers (V and VSL) are covered by their
+// subregisters and have correct subregister lane masks.
+let isArtificial = 1 in {
+ foreach Index = 0-31 in {
+ def FH#Index : FPR<-1, "">;
+ def VFH#Index : VF<-1, "">;
+ }
+}
+
+let isAllocatable = 0, CopyCost = -1 in {
+ def VFHRC : RegisterClass<"PPC", [f64], 64, (sequence "VFH%u", 0, 31)>;
+ def FHRC : RegisterClass<"PPC", [f64], 64, (sequence "FH%u", 0, 31)>;
+}
+
// Floating-point pair registers
foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in {
def Fpair#Index : FPPair<"fp"#Index, Index>;
@@ -168,17 +185,19 @@ foreach Index = 0-31 in {
DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
}
+let CoveredBySubRegs = 1 in {
// Vector registers
foreach Index = 0-31 in {
- def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
+ def V#Index : VR<!cast<VF>("VF"#Index), !cast<VF>("VFH"#Index), "v"#Index>,
DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
}
// VSX registers
foreach Index = 0-31 in {
- def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+ def VSL#Index : VSRL<!cast<FPR>("F"#Index), !cast<FPR>("FH"#Index), "vs"#Index>,
DwarfRegAlias<!cast<FPR>("F"#Index)>;
}
+}
// Dummy VSX registers, this defines string: "vs32"-"vs63", and is only used for
// asm printing.
diff --git a/llvm/lib/Target/README.txt b/llvm/lib/Target/README.txt
index 05e590e..adf75c3 100644
--- a/llvm/lib/Target/README.txt
+++ b/llvm/lib/Target/README.txt
@@ -2,20 +2,6 @@ Target Independent Opportunities:
//===---------------------------------------------------------------------===//
-We should recognized various "overflow detection" idioms and translate them into
-llvm.uadd.with.overflow and similar intrinsics. Here is a multiply idiom:
-
-unsigned int mul(unsigned int a,unsigned int b) {
- if ((unsigned long long)a*b>0xffffffff)
- exit(0);
- return a*b;
-}
-
-The legalization code for mul-with-overflow needs to be made more robust before
-this can be implemented though.
-
-//===---------------------------------------------------------------------===//
-
Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
precision don't matter (ffastmath). Misc/mandel will like this. :) This isn't
safe in general, even on darwin. See the libm implementation of hypot for
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
index b5f8715..f7fa0e1 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp
@@ -329,7 +329,7 @@ static bool isLegalElementTypeForRVV(Type *EltTy,
if (EltTy->isHalfTy())
return Subtarget.hasVInstructionsF16();
if (EltTy->isBFloatTy())
- return Subtarget.hasVInstructionsBF16();
+ return Subtarget.hasVInstructionsBF16Minimal();
if (EltTy->isFloatTy())
return Subtarget.hasVInstructionsF32();
if (EltTy->isDoubleTy())
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index f033ea7..4e583d9 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -67,6 +68,17 @@ typeIsLegalBoolVec(unsigned TypeIdx, std::initializer_list<LLT> BoolVecTys,
return all(typeInSet(TypeIdx, BoolVecTys), P);
}
+static LegalityPredicate typeIsLegalPtrVec(unsigned TypeIdx,
+ std::initializer_list<LLT> PtrVecTys,
+ const RISCVSubtarget &ST) {
+ LegalityPredicate P = [=, &ST](const LegalityQuery &Query) {
+ return ST.hasVInstructions() &&
+ (Query.Types[TypeIdx].getElementCount().getKnownMinValue() != 1 ||
+ ST.getELen() == 64);
+ };
+ return all(typeInSet(TypeIdx, PtrVecTys), P);
+}
+
RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
: STI(ST), XLen(STI.getXLen()), sXLen(LLT::scalar(XLen)) {
const LLT sDoubleXLen = LLT::scalar(2 * XLen);
@@ -111,6 +123,11 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
const LLT nxv4s64 = LLT::scalable_vector(4, s64);
const LLT nxv8s64 = LLT::scalable_vector(8, s64);
+ const LLT nxv1p0 = LLT::scalable_vector(1, p0);
+ const LLT nxv2p0 = LLT::scalable_vector(2, p0);
+ const LLT nxv4p0 = LLT::scalable_vector(4, p0);
+ const LLT nxv8p0 = LLT::scalable_vector(8, p0);
+
using namespace TargetOpcode;
auto BoolVecTys = {nxv1s1, nxv2s1, nxv4s1, nxv8s1, nxv16s1, nxv32s1, nxv64s1};
@@ -120,6 +137,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
nxv32s16, nxv1s32, nxv2s32, nxv4s32, nxv8s32, nxv16s32,
nxv1s64, nxv2s64, nxv4s64, nxv8s64};
+ auto PtrVecTys = {nxv1p0, nxv2p0, nxv4p0, nxv8p0};
+
getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
.legalFor({s32, sXLen})
.legalIf(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST))
@@ -266,6 +285,23 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
{s32, p0, s16, 16},
{s32, p0, s32, 32},
{p0, p0, sXLen, XLen}});
+ if (ST.hasVInstructions())
+ LoadStoreActions.legalForTypesWithMemDesc({{nxv2s8, p0, nxv2s8, 8},
+ {nxv4s8, p0, nxv4s8, 8},
+ {nxv8s8, p0, nxv8s8, 8},
+ {nxv16s8, p0, nxv16s8, 8},
+ {nxv32s8, p0, nxv32s8, 8},
+ {nxv64s8, p0, nxv64s8, 8},
+ {nxv2s16, p0, nxv2s16, 16},
+ {nxv4s16, p0, nxv4s16, 16},
+ {nxv8s16, p0, nxv8s16, 16},
+ {nxv16s16, p0, nxv16s16, 16},
+ {nxv32s16, p0, nxv32s16, 16},
+ {nxv2s32, p0, nxv2s32, 32},
+ {nxv4s32, p0, nxv4s32, 32},
+ {nxv8s32, p0, nxv8s32, 32},
+ {nxv16s32, p0, nxv16s32, 32}});
+
auto &ExtLoadActions =
getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
.legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 16}});
@@ -279,7 +315,28 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
} else if (ST.hasStdExtD()) {
LoadStoreActions.legalForTypesWithMemDesc({{s64, p0, s64, 64}});
}
- LoadStoreActions.clampScalar(0, s32, sXLen).lower();
+ if (ST.hasVInstructions() && ST.getELen() == 64)
+ LoadStoreActions.legalForTypesWithMemDesc({{nxv1s8, p0, nxv1s8, 8},
+ {nxv1s16, p0, nxv1s16, 16},
+ {nxv1s32, p0, nxv1s32, 32}});
+
+ if (ST.hasVInstructionsI64())
+ LoadStoreActions.legalForTypesWithMemDesc({{nxv1s64, p0, nxv1s64, 64},
+
+ {nxv2s64, p0, nxv2s64, 64},
+ {nxv4s64, p0, nxv4s64, 64},
+ {nxv8s64, p0, nxv8s64, 64}});
+
+ LoadStoreActions.widenScalarToNextPow2(0, /* MinSize = */ 8)
+ .lowerIfMemSizeNotByteSizePow2()
+ // we will take the custom lowering logic if we have scalable vector types
+ // with non-standard alignments
+ .customIf(LegalityPredicate(
+ LegalityPredicates::any(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ typeIsLegalPtrVec(0, PtrVecTys, ST))))
+ .clampScalar(0, s32, sXLen)
+ .lower();
+
ExtLoadActions.widenScalarToNextPow2(0).clampScalar(0, s32, sXLen).lower();
getActionDefinitionsBuilder({G_PTR_ADD, G_PTRMASK}).legalFor({{p0, sXLen}});
@@ -651,6 +708,46 @@ bool RISCVLegalizerInfo::legalizeExt(MachineInstr &MI,
return true;
}
+bool RISCVLegalizerInfo::legalizeLoadStore(MachineInstr &MI,
+ LegalizerHelper &Helper,
+ MachineIRBuilder &MIB) const {
+ assert((isa<GLoad>(MI) || isa<GStore>(MI)) &&
+ "Machine instructions must be Load/Store.");
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ MachineFunction *MF = MI.getMF();
+ const DataLayout &DL = MIB.getDataLayout();
+ LLVMContext &Ctx = MF->getFunction().getContext();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DataTy = MRI.getType(DstReg);
+ if (!DataTy.isVector())
+ return false;
+
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ const auto *TLI = STI.getTargetLowering();
+ EVT VT = EVT::getEVT(getTypeForLLT(DataTy, Ctx));
+
+ if (TLI->allowsMemoryAccessForAlignment(Ctx, DL, VT, *MMO))
+ return true;
+
+ unsigned EltSizeBits = DataTy.getScalarSizeInBits();
+ assert((EltSizeBits == 16 || EltSizeBits == 32 || EltSizeBits == 64) &&
+ "Unexpected unaligned RVV load type");
+
+ // Calculate the new vector type with i8 elements
+ unsigned NumElements =
+ DataTy.getElementCount().getKnownMinValue() * (EltSizeBits / 8);
+ LLT NewDataTy = LLT::scalable_vector(NumElements, 8);
+
+ Helper.bitcast(MI, 0, NewDataTy);
+
+ return true;
+}
+
/// Return the type of the mask type suitable for masking the provided
/// vector type. This is simply an i1 element type vector of the same
/// (possibly scalable) length.
@@ -828,6 +925,9 @@ bool RISCVLegalizerInfo::legalizeCustom(
return legalizeExt(MI, MIRBuilder);
case TargetOpcode::G_SPLAT_VECTOR:
return legalizeSplatVector(MI, MIRBuilder);
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ return legalizeLoadStore(MI, Helper, MIRBuilder);
}
llvm_unreachable("expected switch to return");
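A quick illustration of the byte-element rewrite performed by legalizeLoadStore above: when an RVV access is not sufficiently aligned for its element width, the vector is reinterpreted as an i8-element vector with a proportionally larger element count. The sketch below reproduces only that arithmetic; ScalableVT and bitcastToBytes are invented names, not LLVM APIs.

// Illustrative sketch only: nxv<N> x s<EltBits> becomes nxv<N * EltBits/8> x s8.
#include <cassert>
#include <cstdio>

struct ScalableVT {
  unsigned MinNumElts;  // known minimum element count, e.g. 4 for nxv4s32
  unsigned EltSizeBits; // element width in bits
};

static ScalableVT bitcastToBytes(ScalableVT VT) {
  assert(VT.EltSizeBits % 8 == 0 && "expected byte-sized elements");
  return {VT.MinNumElts * (VT.EltSizeBits / 8), 8};
}

int main() {
  ScalableVT Src = {4, 32};             // nxv4s32
  ScalableVT Dst = bitcastToBytes(Src); // nxv16s8
  std::printf("nxv%us%u -> nxv%us%u\n", Src.MinNumElts, Src.EltSizeBits,
              Dst.MinNumElts, Dst.EltSizeBits);
  return 0;
}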
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index 5bb1e7a..2fc2861 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
#define LLVM_LIB_TARGET_RISCV_RISCVMACHINELEGALIZER_H
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/Register.h"
@@ -45,6 +46,8 @@ private:
bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
bool legalizeSplatVector(MachineInstr &MI, MachineIRBuilder &MIB) const;
+ bool legalizeLoadStore(MachineInstr &MI, LegalizerHelper &Helper,
+ MachineIRBuilder &MIB) const;
};
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
index 43bbc85..5369be2 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp
@@ -310,10 +310,19 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
switch (Opc) {
case TargetOpcode::G_LOAD: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- OpdsMapping[0] = GPRValueMapping;
+ TypeSize Size = Ty.getSizeInBits();
+
OpdsMapping[1] = GPRValueMapping;
+
+ if (Ty.isVector()) {
+ OpdsMapping[0] = getVRBValueMapping(Size.getKnownMinValue());
+ break;
+ }
+
+ OpdsMapping[0] = GPRValueMapping;
+
// Use FPR64 for s64 loads on rv32.
- if (GPRSize == 32 && Ty.getSizeInBits() == 64) {
+ if (GPRSize == 32 && Size.getFixedValue() == 64) {
assert(MF.getSubtarget<RISCVSubtarget>().hasStdExtD());
OpdsMapping[0] = getFPValueMapping(Ty.getSizeInBits());
break;
@@ -333,10 +342,19 @@ RISCVRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case TargetOpcode::G_STORE: {
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- OpdsMapping[0] = GPRValueMapping;
+ TypeSize Size = Ty.getSizeInBits();
+
OpdsMapping[1] = GPRValueMapping;
+
+ if (Ty.isVector()) {
+ OpdsMapping[0] = getVRBValueMapping(Size.getKnownMinValue());
+ break;
+ }
+
+ OpdsMapping[0] = GPRValueMapping;
+
// Use FPR64 for s64 stores on rv32.
- if (GPRSize == 32 && Ty.getSizeInBits() == 64) {
+ if (GPRSize == 32 && Size.getFixedValue() == 64) {
assert(MF.getSubtarget<RISCVSubtarget>().hasStdExtD());
OpdsMapping[0] = getFPValueMapping(Ty.getSizeInBits());
break;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 2c2aa8a..26a3ee4 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -19,7 +19,7 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index c0a4d0e..9367743 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -84,8 +84,8 @@ public:
// Returns whether Inst is compressed.
bool EmitToStreamer(MCStreamer &S, const MCInst &Inst);
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
- const MachineInstr *MI);
+
+ bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst);
typedef std::tuple<unsigned, uint32_t> HwasanMemaccessTuple;
std::map<HwasanMemaccessTuple, MCSymbol *> HwasanMemaccessSymbols;
@@ -291,9 +291,10 @@ void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
emitNTLHint(MI);
// Do any auto-generated pseudo lowerings.
- if (emitPseudoExpansionLowering(*OutStreamer, MI))
+ if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+ EmitToStreamer(*OutStreamer, OutInst);
return;
-
+ }
switch (MI->getOpcode()) {
case RISCV::HWASAN_CHECK_MEMACCESS_SHORTGRANULES:
@@ -975,7 +976,7 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
if (hasVLOutput && OpNo == 1)
continue;
- // Skip merge op. It should be the first operand after the defs.
+ // Skip passthru op. It should be the first operand after the defs.
if (OpNo == MI->getNumExplicitDefs() && MO.isReg() && MO.isTied()) {
assert(MCID.getOperandConstraint(OpNo, MCOI::TIED_TO) == 0 &&
"Expected tied to first def.");
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 0a66a38..be2e880 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -187,25 +187,10 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
auto *VTy = cast<VectorType>(II.getType());
IRBuilder<> Builder(&II);
-
- // Extend VL from i32 to XLen if needed.
- if (ST->is64Bit())
- VL = Builder.CreateZExt(VL, Builder.getInt64Ty());
-
Type *STy = VTy->getElementType();
Value *Val = Builder.CreateLoad(STy, BasePtr);
- const auto &TLI = *ST->getTargetLowering();
- Value *Res;
-
- // TODO: Also support fixed/illegal vector types to splat with evl = vl.
- if (isa<ScalableVectorType>(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) {
- unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f
- : Intrinsic::riscv_vmv_v_x;
- Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
- {PoisonValue::get(VTy), Val, VL});
- } else {
- Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val);
- }
+ Value *Res = Builder.CreateIntrinsic(Intrinsic::experimental_vp_splat, {VTy},
+ {Val, II.getOperand(2), VL});
II.replaceAllUsesWith(Res);
II.eraseFromParent();
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 3c868db..e278fa3 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -870,7 +870,7 @@ def HasVInstructionsF16Minimal : Predicate<"Subtarget->hasVInstructionsF16Minima
"'Zvfhmin' (Vector Half-Precision Floating-Point Minimal) or "
"'Zvfh' (Vector Half-Precision Floating-Point)">;
-def HasVInstructionsBF16 : Predicate<"Subtarget->hasVInstructionsBF16()">;
+def HasVInstructionsBF16Minimal : Predicate<"Subtarget->hasVInstructionsBF16Minimal()">;
def HasVInstructionsF16 : Predicate<"Subtarget->hasVInstructionsF16()">;
def HasVInstructionsF64 : Predicate<"Subtarget->hasVInstructionsF64()">;
@@ -1370,9 +1370,7 @@ def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()"
def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
- "SiFive 7-Series processors",
- [TuneNoDefaultUnroll,
- TuneShortForwardBranchOpt]>;
+ "SiFive 7-Series processors">;
def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
"Ventana Veyron-Series processors">;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index e676c2f..7abd5a4 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1535,6 +1535,7 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
const MachineFrameInfo &MFI = MF->getFrameInfo();
RISCVMachineFunctionInfo *RVFI = MF->getInfo<RISCVMachineFunctionInfo>();
const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const RISCVRegisterInfo &TRI = *STI.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, MFI.getCalleeSavedInfo());
@@ -1554,12 +1555,22 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
// Insert the spill to the stack frame.
int FI = CS.getFrameIdx();
if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector) {
- unsigned CFIIndex = MF->addFrameInst(
- createDefCFAOffset(*STI.getRegisterInfo(), CS.getReg(), -FixedSize,
- MFI.getObjectOffset(FI) / 8));
- BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlag(MachineInstr::FrameSetup);
+ MCRegister BaseReg = TRI.getSubReg(CS.getReg(), RISCV::sub_vrm1_0);
+      // If it's not a grouped vector register, it doesn't have a subregister, so
+ // the base register is just itself.
+ if (BaseReg == RISCV::NoRegister)
+ BaseReg = CS.getReg();
+ unsigned NumRegs = RISCV::VRRegClass.contains(CS.getReg()) ? 1
+ : RISCV::VRM2RegClass.contains(CS.getReg()) ? 2
+ : RISCV::VRM4RegClass.contains(CS.getReg()) ? 4
+ : 8;
+ for (unsigned i = 0; i < NumRegs; ++i) {
+ unsigned CFIIndex = MF->addFrameInst(createDefCFAOffset(
+ TRI, BaseReg + i, -FixedSize, MFI.getObjectOffset(FI) / 8 + i));
+ BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index eef6ae6..4de38db 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1393,6 +1393,18 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, SLLI);
return;
}
+ // If we have 32 bits in the mask, we can use SLLI_UW instead of SLLI.
+ if (Trailing > 0 && Leading + Trailing == 32 && C2 + Trailing < XLen &&
+ OneUseOrZExtW && Subtarget->hasStdExtZba()) {
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, X,
+ CurDAG->getTargetConstant(C2 + Trailing, DL, VT));
+ SDNode *SLLI_UW = CurDAG->getMachineNode(
+ RISCV::SLLI_UW, DL, VT, SDValue(SRLI, 0),
+ CurDAG->getTargetConstant(Trailing, DL, VT));
+ ReplaceNode(Node, SLLI_UW);
+ return;
+ }
}
// Turn (and (shl x, c2), c1) -> (slli (srli x, c3-c2), c3) if c1 is a
@@ -1528,7 +1540,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (tryIndexedLoad(Node))
return;
- if (Subtarget->hasVendorXCVmem()) {
+ if (Subtarget->hasVendorXCVmem() && !Subtarget->is64Bit()) {
// We match post-incrementing load here
LoadSDNode *Load = cast<LoadSDNode>(Node);
if (Load->getAddressingMode() != ISD::POST_INC)
@@ -3621,7 +3633,7 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) {
#endif
SmallVector<SDValue, 8> Ops;
- // Skip the merge operand at index 0 if !UseTUPseudo.
+ // Skip the passthru operand at index 0 if !UseTUPseudo.
for (unsigned I = !UseTUPseudo, E = N->getNumOperands(); I != E; I++) {
// Skip the mask, and the Glue.
SDValue Op = N->getOperand(I);
@@ -3684,9 +3696,9 @@ static unsigned GetVMSetForLMul(RISCVII::VLMUL LMUL) {
// ->
// %x = PseudoVADD_VV_MASK %false, ..., %mask
//
-// We can only fold if vmerge's merge operand, vmerge's false operand and
-// %true's merge operand (if it has one) are the same. This is because we have
-// to consolidate them into one merge operand in the result.
+// We can only fold if vmerge's passthru operand, vmerge's false operand and
+// %true's passthru operand (if it has one) are the same. This is because we
+// have to consolidate them into one passthru operand in the result.
//
// If %true is masked, then we can use its mask instead of vmerge's if vmerge's
// mask is all ones.
@@ -3697,12 +3709,12 @@ static unsigned GetVMSetForLMul(RISCVII::VLMUL LMUL) {
// The resulting VL is the minimum of the two VLs.
//
// The resulting policy is the effective policy the vmerge would have had,
-// i.e. whether or not it's merge operand was implicit-def.
+// i.e. whether or not its passthru operand was implicit-def.
bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
- SDValue Merge, False, True, VL, Mask, Glue;
+ SDValue Passthru, False, True, VL, Mask, Glue;
// A vmv.v.v is equivalent to a vmerge with an all-ones mask.
if (IsVMv(N)) {
- Merge = N->getOperand(0);
+ Passthru = N->getOperand(0);
False = N->getOperand(0);
True = N->getOperand(1);
VL = N->getOperand(2);
@@ -3710,7 +3722,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
// mask later below.
} else {
assert(IsVMerge(N));
- Merge = N->getOperand(0);
+ Passthru = N->getOperand(0);
False = N->getOperand(1);
True = N->getOperand(2);
Mask = N->getOperand(3);
@@ -3721,9 +3733,13 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
assert(!Mask || cast<RegisterSDNode>(Mask)->getReg() == RISCV::V0);
assert(!Glue || Glue.getValueType() == MVT::Glue);
- // We require that either merge and false are the same, or that merge
+ // If the EEW of True is different from vmerge's SEW, then we can't fold.
+ if (True.getSimpleValueType() != N->getSimpleValueType(0))
+ return false;
+
+ // We require that either passthru and false are the same, or that passthru
// is undefined.
- if (Merge != False && !isImplicitDef(Merge))
+ if (Passthru != False && !isImplicitDef(Passthru))
return false;
assert(True.getResNo() == 0 &&
@@ -3753,11 +3769,11 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
if (!Info)
return false;
- // If True has a merge operand then it needs to be the same as vmerge's False,
- // since False will be used for the result's merge operand.
+ // If True has a passthru operand then it needs to be the same as vmerge's
+ // False, since False will be used for the result's passthru operand.
if (HasTiedDest && !isImplicitDef(True->getOperand(0))) {
- SDValue MergeOpTrue = True->getOperand(0);
- if (False != MergeOpTrue)
+ SDValue PassthruOpTrue = True->getOperand(0);
+ if (False != PassthruOpTrue)
return false;
}
@@ -3765,7 +3781,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
// 1s mask, since we're going to keep the mask from True.
if (IsMasked && Mask) {
// FIXME: Support mask agnostic True instruction which would have an
- // undef merge operand.
+ // undef passthru operand.
SDValue TrueMask =
getMaskSetter(True->getOperand(Info->MaskOpIdx),
True->getOperand(True->getNumOperands() - 1));
@@ -3823,8 +3839,8 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
return CLHS->getZExtValue() <= CRHS->getZExtValue() ? LHS : RHS;
};
- // Because N and True must have the same merge operand (or True's operand is
- // implicit_def), the "effective" body is the minimum of their VLs.
+ // Because N and True must have the same passthru operand (or True's operand
+ // is implicit_def), the "effective" body is the minimum of their VLs.
SDValue OrigVL = VL;
VL = GetMinVL(TrueVL, VL);
if (!VL)
@@ -3883,7 +3899,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
"Expected instructions with mask have a tied dest.");
#endif
- // Use a tumu policy, relaxing it to tail agnostic provided that the merge
+ // Use a tumu policy, relaxing it to tail agnostic provided that the passthru
// operand is undefined.
//
// However, if the VL became smaller than what the vmerge had originally, then
@@ -3891,7 +3907,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
// to the tail. In that case we always need to use tail undisturbed to
// preserve them.
bool MergeVLShrunk = VL != OrigVL;
- uint64_t Policy = (isImplicitDef(Merge) && !MergeVLShrunk)
+ uint64_t Policy = (isImplicitDef(Passthru) && !MergeVLShrunk)
? RISCVII::TAIL_AGNOSTIC
: /*TUMU*/ 0;
SDValue PolicyOp =
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0339b30..68b614d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -198,7 +198,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
for (MVT VT : F16VecVTs)
addRegClassForRVV(VT);
- if (Subtarget.hasVInstructionsBF16())
+ if (Subtarget.hasVInstructionsBF16Minimal())
for (MVT VT : BF16VecVTs)
addRegClassForRVV(VT);
@@ -250,14 +250,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- if (!Subtarget.hasVendorXCValu())
- setCondCodeAction(ISD::SETLE, XLenVT, Expand);
setCondCodeAction(ISD::SETGT, XLenVT, Custom);
setCondCodeAction(ISD::SETGE, XLenVT, Expand);
- if (!Subtarget.hasVendorXCValu())
- setCondCodeAction(ISD::SETULE, XLenVT, Expand);
setCondCodeAction(ISD::SETUGT, XLenVT, Custom);
setCondCodeAction(ISD::SETUGE, XLenVT, Expand);
+ if (!(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
+ setCondCodeAction(ISD::SETULE, XLenVT, Expand);
+ setCondCodeAction(ISD::SETLE, XLenVT, Expand);
+ }
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction(ISD::SETCC, MVT::i32, Promote);
@@ -273,7 +273,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
- if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb())
+ if (!Subtarget.hasStdExtZbb() && !Subtarget.hasVendorXTHeadBb() &&
+ !(Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()))
setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::i8, MVT::i16}, Expand);
if (Subtarget.is64Bit()) {
@@ -343,7 +344,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.is64Bit())
setOperationAction({ISD::ROTL, ISD::ROTR}, MVT::i32, Custom);
setOperationAction({ISD::ROTL, ISD::ROTR}, XLenVT, Custom);
- } else if (Subtarget.hasVendorXCVbitmanip()) {
+ } else if (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()) {
setOperationAction(ISD::ROTL, XLenVT, Expand);
} else {
setOperationAction({ISD::ROTL, ISD::ROTR}, XLenVT, Expand);
@@ -366,7 +367,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
: Expand);
- if (Subtarget.hasVendorXCVbitmanip()) {
+ if (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit()) {
setOperationAction(ISD::BITREVERSE, XLenVT, Legal);
} else {
// Zbkb can use rev8+brev8 to implement bitreverse.
@@ -374,27 +375,31 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
Subtarget.hasStdExtZbkb() ? Custom : Expand);
}
- if (Subtarget.hasStdExtZbb()) {
+ if (Subtarget.hasStdExtZbb() ||
+ (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit())) {
setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, XLenVT,
Legal);
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction({ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, MVT::i32,
Promote);
+ }
+ if (Subtarget.hasStdExtZbb() ||
+ (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) {
if (Subtarget.is64Bit()) {
if (RV64LegalI32)
setOperationAction(ISD::CTTZ, MVT::i32, Legal);
else
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
}
- } else if (!Subtarget.hasVendorXCVbitmanip()) {
+ } else {
setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
if (RV64LegalI32 && Subtarget.is64Bit())
setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
}
if (Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() ||
- Subtarget.hasVendorXCVbitmanip()) {
+ (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) {
// We need the custom lowering to make sure that the resulting sequence
// for the 32bit case is efficient on 64bit targets.
if (Subtarget.is64Bit()) {
@@ -412,13 +417,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ, MVT::i32, Expand);
}
- if (!RV64LegalI32 && Subtarget.is64Bit() &&
- !Subtarget.hasShortForwardBranchOpt())
- setOperationAction(ISD::ABS, MVT::i32, Custom);
-
- // We can use PseudoCCSUB to implement ABS.
- if (Subtarget.hasShortForwardBranchOpt())
+ if (Subtarget.hasVendorXCValu() && !Subtarget.is64Bit()) {
setOperationAction(ISD::ABS, XLenVT, Legal);
+ } else {
+ if (!RV64LegalI32 && Subtarget.is64Bit() &&
+ !Subtarget.hasShortForwardBranchOpt())
+ setOperationAction(ISD::ABS, MVT::i32, Custom);
+
+ // We can use PseudoCCSUB to implement ABS.
+ if (Subtarget.hasShortForwardBranchOpt())
+ setOperationAction(ISD::ABS, XLenVT, Legal);
+ }
if (!Subtarget.hasVendorXTHeadCondMov()) {
setOperationAction(ISD::SELECT, XLenVT, Custom);
@@ -1092,7 +1101,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
// TODO: Could we merge some code with zvfhmin?
- if (Subtarget.hasVInstructionsBF16()) {
+ if (Subtarget.hasVInstructionsBF16Minimal()) {
for (MVT VT : BF16VecVTs) {
if (!isTypeLegal(VT))
continue;
@@ -1439,7 +1448,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
}
- if (Subtarget.hasVendorXCVmem()) {
+ if (Subtarget.hasVendorXCVmem() && !Subtarget.is64Bit()) {
setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
@@ -1449,16 +1458,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
}
- if (Subtarget.hasVendorXCValu()) {
- setOperationAction(ISD::ABS, XLenVT, Legal);
- setOperationAction(ISD::SMIN, XLenVT, Legal);
- setOperationAction(ISD::UMIN, XLenVT, Legal);
- setOperationAction(ISD::SMAX, XLenVT, Legal);
- setOperationAction(ISD::UMAX, XLenVT, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
- }
-
// Function alignments.
const Align FunctionAlignment(Subtarget.hasStdExtCOrZca() ? 2 : 4);
setMinFunctionAlignment(FunctionAlignment);
@@ -1473,13 +1472,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SRA);
if (Subtarget.hasStdExtFOrZfinx())
- setTargetDAGCombine({ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM});
+ setTargetDAGCombine({ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM, ISD::FMUL});
if (Subtarget.hasStdExtZbb())
setTargetDAGCombine({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN});
if ((Subtarget.hasStdExtZbs() && Subtarget.is64Bit()) ||
- Subtarget.hasStdExtV())
+ Subtarget.hasVInstructions())
setTargetDAGCombine(ISD::TRUNCATE);
if (Subtarget.hasStdExtZbkb())
@@ -1835,6 +1834,10 @@ bool RISCVTargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.BaseGV)
return false;
+ // None of our addressing modes allows a scalable offset
+ if (AM.ScalableOffset)
+ return false;
+
// RVV instructions only support register addressing.
if (Subtarget.hasVInstructions() && isa<VectorType>(Ty))
return AM.HasBaseReg && AM.Scale == 0 && !AM.BaseOffs;
@@ -1892,7 +1895,7 @@ bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
bool RISCVTargetLowering::isTruncateFree(SDValue Val, EVT VT2) const {
EVT SrcVT = Val.getValueType();
// free truncate from vnsrl and vnsra
- if (Subtarget.hasStdExtV() &&
+ if (Subtarget.hasVInstructions() &&
(Val.getOpcode() == ISD::SRL || Val.getOpcode() == ISD::SRA) &&
SrcVT.isVector() && VT2.isVector()) {
unsigned SrcBits = SrcVT.getVectorElementType().getSizeInBits();
@@ -1928,12 +1931,13 @@ bool RISCVTargetLowering::signExtendConstant(const ConstantInt *CI) const {
}
bool RISCVTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
- return Subtarget.hasStdExtZbb() || Subtarget.hasVendorXCVbitmanip();
+ return Subtarget.hasStdExtZbb() ||
+ (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit());
}
bool RISCVTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
return Subtarget.hasStdExtZbb() || Subtarget.hasVendorXTHeadBb() ||
- Subtarget.hasVendorXCVbitmanip();
+ (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit());
}
bool RISCVTargetLowering::isMaskAndCmp0FoldingBeneficial(
@@ -2637,7 +2641,7 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
return false;
break;
case MVT::bf16:
- if (!Subtarget.hasVInstructionsBF16())
+ if (!Subtarget.hasVInstructionsBF16Minimal())
return false;
break;
case MVT::f32:
@@ -3293,25 +3297,25 @@ static SDValue lowerVectorXRINT(SDValue Op, SelectionDAG &DAG,
static SDValue
getVSlidedown(SelectionDAG &DAG, const RISCVSubtarget &Subtarget,
- const SDLoc &DL, EVT VT, SDValue Merge, SDValue Op,
+ const SDLoc &DL, EVT VT, SDValue Passthru, SDValue Op,
SDValue Offset, SDValue Mask, SDValue VL,
unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) {
- if (Merge.isUndef())
+ if (Passthru.isUndef())
Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT());
- SDValue Ops[] = {Merge, Op, Offset, Mask, VL, PolicyOp};
+ SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp};
return DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VT, Ops);
}
static SDValue
getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
- EVT VT, SDValue Merge, SDValue Op, SDValue Offset, SDValue Mask,
+ EVT VT, SDValue Passthru, SDValue Op, SDValue Offset, SDValue Mask,
SDValue VL,
unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) {
- if (Merge.isUndef())
+ if (Passthru.isUndef())
Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC;
SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT());
- SDValue Ops[] = {Merge, Op, Offset, Mask, VL, PolicyOp};
+ SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp};
return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops);
}
@@ -3329,8 +3333,8 @@ struct VIDSequence {
int64_t Addend;
};
-static std::optional<uint64_t> getExactInteger(const APFloat &APF,
- uint32_t BitWidth) {
+static std::optional<APInt> getExactInteger(const APFloat &APF,
+ uint32_t BitWidth) {
// We will use a SINT_TO_FP to materialize this constant so we should use a
// signed APSInt here.
APSInt ValInt(BitWidth, /*IsUnsigned*/ false);
@@ -3346,7 +3350,7 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF,
APFloatBase::opInvalidOp) ||
!IsExact)
return std::nullopt;
- return ValInt.extractBitsAsZExtValue(BitWidth, 0);
+ return ValInt.extractBits(BitWidth, 0);
}
// Try to match an arithmetic-sequence BUILD_VECTOR [X,X+S,X+2*S,...,X+(N-1)*S]
@@ -3359,6 +3363,9 @@ static std::optional<uint64_t> getExactInteger(const APFloat &APF,
// Note that this method will also match potentially unappealing index
// sequences, like <i32 0, i32 50939494>, however it is left to the caller to
// determine whether this is worth generating code for.
+//
+// EltSizeInBits is the size of the type that the sequence will be calculated
+// in, i.e. SEW for build_vectors or XLEN for address calculations.
static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
unsigned EltSizeInBits) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR");
@@ -3367,13 +3374,14 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
bool IsInteger = Op.getValueType().isInteger();
std::optional<unsigned> SeqStepDenom;
- std::optional<int64_t> SeqStepNum, SeqAddend;
- std::optional<std::pair<uint64_t, unsigned>> PrevElt;
+ std::optional<APInt> SeqStepNum;
+ std::optional<APInt> SeqAddend;
+ std::optional<std::pair<APInt, unsigned>> PrevElt;
assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits());
// First extract the ops into a list of constant integer values. This may not
// be possible for floats if they're not all representable as integers.
- SmallVector<std::optional<uint64_t>> Elts(Op.getNumOperands());
+ SmallVector<std::optional<APInt>> Elts(Op.getNumOperands());
const unsigned OpSize = Op.getScalarValueSizeInBits();
for (auto [Idx, Elt] : enumerate(Op->op_values())) {
if (Elt.isUndef()) {
@@ -3381,7 +3389,7 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
continue;
}
if (IsInteger) {
- Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes<uint64_t>(OpSize);
+ Elts[Idx] = Elt->getAsAPIntVal().trunc(OpSize).zext(EltSizeInBits);
} else {
auto ExactInteger =
getExactInteger(cast<ConstantFPSDNode>(Elt)->getValueAPF(), OpSize);
@@ -3401,7 +3409,7 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
// Calculate the step since the last non-undef element, and ensure
// it's consistent across the entire sequence.
unsigned IdxDiff = Idx - PrevElt->second;
- int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits);
+ APInt ValDiff = *Elt - PrevElt->first;
// A zero-value value difference means that we're somewhere in the middle
// of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a
@@ -3409,13 +3417,13 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
if (ValDiff == 0)
continue;
- int64_t Remainder = ValDiff % IdxDiff;
+ int64_t Remainder = ValDiff.srem(IdxDiff);
// Normalize the step if it's greater than 1.
- if (Remainder != ValDiff) {
+ if (Remainder != ValDiff.getSExtValue()) {
// The difference must cleanly divide the element span.
if (Remainder != 0)
return std::nullopt;
- ValDiff /= IdxDiff;
+ ValDiff = ValDiff.sdiv(IdxDiff);
IdxDiff = 1;
}
@@ -3444,9 +3452,10 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
for (auto [Idx, Elt] : enumerate(Elts)) {
if (!Elt)
continue;
- uint64_t ExpectedVal =
- (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom;
- int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits);
+ APInt ExpectedVal =
+ (APInt(EltSizeInBits, Idx) * *SeqStepNum).sdiv(*SeqStepDenom);
+
+ APInt Addend = *Elt - ExpectedVal;
if (!SeqAddend)
SeqAddend = Addend;
else if (Addend != SeqAddend)
@@ -3455,7 +3464,8 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
assert(SeqAddend && "Must have an addend if we have a step");
- return VIDSequence{*SeqStepNum, *SeqStepDenom, *SeqAddend};
+ return VIDSequence{SeqStepNum->getSExtValue(), *SeqStepDenom,
+ SeqAddend->getSExtValue()};
}
// Match a splatted value (SPLAT_VECTOR/BUILD_VECTOR) of an EXTRACT_VECTOR_ELT
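Editor's note on the hunk above: isSimpleVIDSequence now does its arithmetic in APInt instead of fixed-width uint64_t, so the element differences, the Idx * StepNum product, and the addend are all computed at EltSizeInBits. For readers who only want the shape being matched (Elt[Idx] == Addend + (Idx * StepNum) / StepDenom), here is a rough standalone sketch in plain C++; it handles only integer elements and whole-number steps, whereas the real code also deals with fractional steps, undef lanes, and float elements:

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct VIDSeq {
  int64_t StepNum; // step between consecutive indices (denominator fixed to 1 here)
  int64_t Addend;  // value at index 0
};

static std::optional<VIDSeq> matchSimpleVID(const std::vector<int64_t> &Elts) {
  if (Elts.size() < 2)
    return std::nullopt;
  // The step between consecutive elements must be constant.
  int64_t Step = Elts[1] - Elts[0];
  for (size_t Idx = 2; Idx < Elts.size(); ++Idx)
    if (Elts[Idx] - Elts[Idx - 1] != Step)
      return std::nullopt;
  return VIDSeq{Step, Elts[0]};
}

int main() {
  // <3, 5, 7, 9> is vid.v * 2 + 3: step 2, addend 3.
  if (auto Seq = matchSimpleVID({3, 5, 7, 9}))
    std::cout << "step " << Seq->StepNum << ", addend " << Seq->Addend << '\n';
}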
@@ -6079,8 +6089,8 @@ static unsigned getRISCVVLOp(SDValue Op) {
#undef VP_CASE
}
-/// Return true if a RISC-V target specified op has a merge operand.
-static bool hasMergeOp(unsigned Opcode) {
+/// Return true if a RISC-V target specific op has a passthru operand.
+static bool hasPassthruOp(unsigned Opcode) {
assert(Opcode > RISCVISD::FIRST_NUMBER &&
Opcode <= RISCVISD::LAST_RISCV_STRICTFP_OPCODE &&
"not a RISC-V target specific op");
@@ -6834,7 +6844,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
Subtarget.hasStdExtZfhminOrZhinxmin() &&
!Subtarget.hasVInstructionsF16())) ||
(Op.getValueType().getScalarType() == MVT::bf16 &&
- (Subtarget.hasVInstructionsBF16() && Subtarget.hasStdExtZfbfmin()))) {
+ (Subtarget.hasVInstructionsBF16Minimal() &&
+ Subtarget.hasStdExtZfbfmin()))) {
if (Op.getValueType() == MVT::nxv32f16 ||
Op.getValueType() == MVT::nxv32bf16)
return SplitVectorOp(Op, DAG);
@@ -8846,14 +8857,7 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,
I32VL = DAG.getConstant(2 * AVLInt, DL, XLenVT);
} else if (AVLInt >= 2 * MaxVLMAX) {
// Just set vl to VLMAX in this situation
- RISCVII::VLMUL Lmul = RISCVTargetLowering::getLMUL(I32VT);
- SDValue LMUL = DAG.getConstant(Lmul, DL, XLenVT);
- unsigned Sew = RISCVVType::encodeSEW(I32VT.getScalarSizeInBits());
- SDValue SEW = DAG.getConstant(Sew, DL, XLenVT);
- SDValue SETVLMAX = DAG.getTargetConstant(
- Intrinsic::riscv_vsetvlimax, DL, MVT::i32);
- I32VL = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XLenVT, SETVLMAX, SEW,
- LMUL);
+ I32VL = DAG.getRegister(RISCV::X0, XLenVT);
} else {
// For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
// is related to the hardware implementation.
@@ -10945,7 +10949,7 @@ SDValue RISCVTargetLowering::lowerVectorStrictFSetcc(SDValue Op,
True, VL});
Mask =
DAG.getNode(RISCVISD::VMAND_VL, DL, MaskVT, OrderMask1, OrderMask2, VL);
- // Use Mask as the merge operand to let the result be 0 if either of the
+ // Use Mask as the passthru operand to let the result be 0 if either of the
// inputs is unordered.
Res = DAG.getNode(RISCVISD::STRICT_FSETCCS_VL, DL,
DAG.getVTList(MaskVT, MVT::Other),
@@ -11050,7 +11054,7 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV(
SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned NewOpc = getRISCVVLOp(Op);
- bool HasMergeOp = hasMergeOp(NewOpc);
+ bool HasPassthruOp = hasPassthruOp(NewOpc);
bool HasMask = hasMaskOp(NewOpc);
MVT VT = Op.getSimpleValueType();
@@ -11075,7 +11079,7 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
SDLoc DL(Op);
auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
- if (HasMergeOp)
+ if (HasPassthruOp)
Ops.push_back(DAG.getUNDEF(ContainerVT));
if (HasMask)
Ops.push_back(Mask);
@@ -11103,7 +11107,7 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op,
// types.
SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const {
unsigned RISCVISDOpc = getRISCVVLOp(Op);
- bool HasMergeOp = hasMergeOp(RISCVISDOpc);
+ bool HasPassthruOp = hasPassthruOp(RISCVISDOpc);
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
@@ -11116,9 +11120,9 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const {
for (const auto &OpIdx : enumerate(Op->ops())) {
SDValue V = OpIdx.value();
assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
- // Add dummy merge value before the mask. Or if there isn't a mask, before
- // EVL.
- if (HasMergeOp) {
+ // Add dummy passthru value before the mask. Or if there isn't a mask,
+ // before EVL.
+ if (HasPassthruOp) {
auto MaskIdx = ISD::getVPMaskIdx(Op.getOpcode());
if (MaskIdx) {
if (*MaskIdx == OpIdx.index())
@@ -14309,6 +14313,14 @@ struct NodeExtensionHelper {
return RISCVISD::VFWSUB_VL;
case RISCVISD::FMUL_VL:
return RISCVISD::VFWMUL_VL;
+ case RISCVISD::VFMADD_VL:
+ return RISCVISD::VFWMADD_VL;
+ case RISCVISD::VFMSUB_VL:
+ return RISCVISD::VFWMSUB_VL;
+ case RISCVISD::VFNMADD_VL:
+ return RISCVISD::VFWNMADD_VL;
+ case RISCVISD::VFNMSUB_VL:
+ return RISCVISD::VFWNMSUB_VL;
default:
llvm_unreachable("Unexpected opcode");
}
@@ -14502,6 +14514,11 @@ struct NodeExtensionHelper {
Subtarget.hasStdExtZvbb();
case RISCVISD::SHL_VL:
return Subtarget.hasStdExtZvbb();
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFMSUB_VL:
+ return true;
default:
return false;
}
@@ -14582,6 +14599,10 @@ struct NodeExtensionHelper {
case RISCVISD::FADD_VL:
case RISCVISD::FMUL_VL:
case RISCVISD::VFWADD_W_VL:
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFMSUB_VL:
return true;
case ISD::SUB:
case RISCVISD::SUB_VL:
@@ -14633,25 +14654,25 @@ struct CombineResult {
/// The actual replacement is *not* done in that method.
SDValue materialize(SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) const {
- SDValue Mask, VL, Merge;
+ SDValue Mask, VL, Passthru;
std::tie(Mask, VL) =
NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget);
switch (Root->getOpcode()) {
default:
- Merge = Root->getOperand(2);
+ Passthru = Root->getOperand(2);
break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::OR:
case ISD::SHL:
- Merge = DAG.getUNDEF(Root->getValueType(0));
+ Passthru = DAG.getUNDEF(Root->getValueType(0));
break;
}
return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0),
LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, LHSExt),
RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, RHSExt),
- Merge, Mask, VL);
+ Passthru, Mask, VL);
}
};
@@ -14797,6 +14818,10 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
Strategies.push_back(canFoldToVW_W);
break;
case RISCVISD::FMUL_VL:
+ case RISCVISD::VFMADD_VL:
+ case RISCVISD::VFMSUB_VL:
+ case RISCVISD::VFNMADD_VL:
+ case RISCVISD::VFNMSUB_VL:
Strategies.push_back(canFoldToVWWithSameExtension);
break;
case ISD::MUL:
@@ -14833,7 +14858,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
}
} // End anonymous namespace.
-/// Combine a binary operation to its equivalent VW or VW_W form.
+/// Combine a binary or FMA operation to its equivalent VW or VW_W form.
/// The supported combines are:
/// add | add_vl | or disjoint -> vwadd(u) | vwadd(u)_w
/// sub | sub_vl -> vwsub(u) | vwsub(u)_w
@@ -14846,9 +14871,9 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {
/// vwsub_w(u) -> vwsub(u)
/// vfwadd_w -> vfwadd
/// vfwsub_w -> vfwsub
-static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const RISCVSubtarget &Subtarget) {
+static SDValue combineOp_VLToVWOp_VL(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
SelectionDAG &DAG = DCI.DAG;
if (DCI.isBeforeLegalize())
return SDValue();
@@ -14864,19 +14889,26 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
while (!Worklist.empty()) {
SDNode *Root = Worklist.pop_back_val();
- if (!NodeExtensionHelper::isSupportedRoot(Root, Subtarget))
- return SDValue();
NodeExtensionHelper LHS(Root, 0, DAG, Subtarget);
NodeExtensionHelper RHS(Root, 1, DAG, Subtarget);
- auto AppendUsersIfNeeded = [&Worklist,
+ auto AppendUsersIfNeeded = [&Worklist, &Subtarget,
&Inserted](const NodeExtensionHelper &Op) {
if (Op.needToPromoteOtherUsers()) {
- for (SDNode *TheUse : Op.OrigOperand->uses()) {
+ for (SDNode::use_iterator UI = Op.OrigOperand->use_begin(),
+ UE = Op.OrigOperand->use_end();
+ UI != UE; ++UI) {
+ SDNode *TheUse = *UI;
+ if (!NodeExtensionHelper::isSupportedRoot(TheUse, Subtarget))
+ return false;
+ // We only support the first 2 operands of FMA.
+ if (UI.getOperandNo() >= 2)
+ return false;
if (Inserted.insert(TheUse).second)
Worklist.push_back(TheUse);
}
}
+ return true;
};
  // Control the compile time by limiting the number of nodes we look at in
@@ -14904,9 +14936,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,
      // we would be leaving the old input (since it may still be used),
// and the new one.
if (Res->LHSExt.has_value())
- AppendUsersIfNeeded(LHS);
+ if (!AppendUsersIfNeeded(LHS))
+ return SDValue();
if (Res->RHSExt.has_value())
- AppendUsersIfNeeded(RHS);
+ if (!AppendUsersIfNeeded(RHS))
+ return SDValue();
break;
}
}
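Editor's note: the reworked worklist above walks an extension's users explicitly so the combine can be abandoned when a user is not itself a supported root, or when the extension feeds an FMA through anything other than operand 0 or 1. That restriction reflects the widening-FMA shape: assuming the usual widening multiply-add semantics (narrow multiplicands, already-wide accumulator), only the first two operands are candidates for extension, as in this standalone sketch:

#include <cmath>
#include <cstdio>

// Illustrative only: a "widening" FMA extends the two multiplicands to the
// wider type; the accumulator/addend is already at the wide type.
static double wideningFMA(float A, float B, double Acc) {
  return std::fma(static_cast<double>(A), static_cast<double>(B), Acc);
}

int main() {
  std::printf("%g\n", wideningFMA(1.5f, 2.0f, 0.25)); // prints 3.25
}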
@@ -14993,7 +15027,7 @@ static SDValue performVWADDSUBW_VLCombine(SDNode *N,
assert(Opc == RISCVISD::VWADD_W_VL || Opc == RISCVISD::VWADDU_W_VL ||
Opc == RISCVISD::VWSUB_W_VL || Opc == RISCVISD::VWSUBU_W_VL);
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return combineVWADDSUBWSelect(N, DCI.DAG);
@@ -15408,8 +15442,11 @@ static SDValue combineVFMADD_VLWithVFNEG_VL(SDNode *N, SelectionDAG &DAG) {
VL);
}
-static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue performVFMADD_VLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+
if (SDValue V = combineVFMADD_VLWithVFNEG_VL(N, DAG))
return V;
@@ -15421,50 +15458,7 @@ static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG,
if (N->isTargetStrictFPOpcode())
return SDValue();
- // Try to form widening FMA.
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- SDValue Mask = N->getOperand(3);
- SDValue VL = N->getOperand(4);
-
- if (Op0.getOpcode() != RISCVISD::FP_EXTEND_VL ||
- Op1.getOpcode() != RISCVISD::FP_EXTEND_VL)
- return SDValue();
-
- // TODO: Refactor to handle more complex cases similar to
- // combineBinOp_VLToVWBinOp_VL.
- if ((!Op0.hasOneUse() || !Op1.hasOneUse()) &&
- (Op0 != Op1 || !Op0->hasNUsesOfValue(2, 0)))
- return SDValue();
-
- // Check the mask and VL are the same.
- if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL ||
- Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
- return SDValue();
-
- unsigned NewOpc;
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Unexpected opcode");
- case RISCVISD::VFMADD_VL:
- NewOpc = RISCVISD::VFWMADD_VL;
- break;
- case RISCVISD::VFNMSUB_VL:
- NewOpc = RISCVISD::VFWNMSUB_VL;
- break;
- case RISCVISD::VFNMADD_VL:
- NewOpc = RISCVISD::VFWNMADD_VL;
- break;
- case RISCVISD::VFMSUB_VL:
- NewOpc = RISCVISD::VFWMSUB_VL;
- break;
- }
-
- Op0 = Op0.getOperand(0);
- Op1 = Op1.getOperand(0);
-
- return DAG.getNode(NewOpc, SDLoc(N), N->getValueType(0), Op0, Op1,
- N->getOperand(2), Mask, VL);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
}
static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
@@ -16161,8 +16155,8 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
SDValue MulOp = N->getOperand(1);
if (N->getOpcode() == RISCVISD::ADD_VL) {
- SDValue AddMergeOp = N->getOperand(2);
- if (!AddMergeOp.isUndef())
+ SDValue AddPassthruOp = N->getOperand(2);
+ if (!AddPassthruOp.isUndef())
return SDValue();
}
@@ -16183,9 +16177,9 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
if (!IsVWMulOpc(MulOp.getOpcode()))
return SDValue();
- SDValue MulMergeOp = MulOp.getOperand(2);
+ SDValue MulPassthruOp = MulOp.getOperand(2);
- if (!MulMergeOp.isUndef())
+ if (!MulPassthruOp.isUndef())
return SDValue();
auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG,
@@ -16661,28 +16655,28 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case ISD::ADD: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
if (SDValue V = combineToVWMACC(N, DAG, Subtarget))
return V;
return performADDCombine(N, DCI, Subtarget);
}
case ISD::SUB: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performSUBCombine(N, DAG, Subtarget);
}
case ISD::AND:
return performANDCombine(N, DCI, Subtarget);
case ISD::OR: {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performORCombine(N, DCI, Subtarget);
}
case ISD::XOR:
return performXORCombine(N, DAG, Subtarget);
case ISD::MUL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return performMULCombine(N, DAG, DCI, Subtarget);
case ISD::SDIV:
@@ -16692,6 +16686,25 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = combineBinOpOfZExt(N, DAG))
return V;
break;
+ case ISD::FMUL: {
+ // fmul X, (copysign 1.0, Y) -> fsgnjx X, Y
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0->getOpcode() != ISD::FCOPYSIGN)
+ std::swap(N0, N1);
+ if (N0->getOpcode() != ISD::FCOPYSIGN)
+ return SDValue();
+ ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0->getOperand(0));
+ if (!C || !C->getValueAPF().isExactlyValue(+1.0))
+ return SDValue();
+ EVT VT = N->getValueType(0);
+ if (VT.isVector() || !isOperationLegal(ISD::FCOPYSIGN, VT))
+ return SDValue();
+ SDValue Sign = N0->getOperand(1);
+ if (Sign.getValueType() != VT)
+ return SDValue();
+ return DAG.getNode(RISCVISD::FSGNJX, SDLoc(N), VT, N1, N0->getOperand(1));
+ }
case ISD::FADD:
case ISD::UMAX:
case ISD::UMIN:
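Editor's note: the new ISD::FMUL case above folds fmul X, (copysign 1.0, Y) into the newly added FSGNJX node, i.e. a single sign-injection instruction. A quick standalone check of the identity (illustrative only, ignoring NaN payload details): multiplying by copysign(1.0, Y) flips X's sign exactly when Y is negative, which is sign(X) XOR sign(Y) applied to |X|:

#include <cmath>
#include <cstdio>

int main() {
  const double Xs[] = {2.5, -2.5};
  const double Ys[] = {3.0, -3.0, -0.0};
  for (double X : Xs)
    for (double Y : Ys) {
      // Left-hand side of the combine: x * copysign(1.0, y).
      double Mul = X * std::copysign(1.0, Y);
      // What fsgnjx computes: |x| with sign(x) XOR sign(y).
      double Sgnjx = std::copysign(
          std::fabs(X), std::signbit(X) == std::signbit(Y) ? 1.0 : -1.0);
      std::printf("x=% .1f y=% .1f  mul=% .1f  fsgnjx=% .1f\n", X, Y, Mul, Sgnjx);
    }
}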
@@ -17107,7 +17120,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::SHL_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
[[fallthrough]];
case RISCVISD::SRA_VL:
@@ -17132,7 +17145,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL:
case ISD::SHL: {
if (N->getOpcode() == ISD::SHL) {
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
}
SDValue ShAmt = N->getOperand(1);
@@ -17148,7 +17161,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::ADD_VL:
- if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
return V;
return combineToVWMACC(N, DAG, Subtarget);
case RISCVISD::VWADD_W_VL:
@@ -17158,7 +17171,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return performVWADDSUBW_VLCombine(N, DCI, Subtarget);
case RISCVISD::SUB_VL:
case RISCVISD::MUL_VL:
- return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
case RISCVISD::VFMADD_VL:
case RISCVISD::VFNMADD_VL:
case RISCVISD::VFMSUB_VL:
@@ -17167,7 +17180,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case RISCVISD::STRICT_VFNMADD_VL:
case RISCVISD::STRICT_VFMSUB_VL:
case RISCVISD::STRICT_VFNMSUB_VL:
- return performVFMADD_VLCombine(N, DAG, Subtarget);
+ return performVFMADD_VLCombine(N, DCI, Subtarget);
case RISCVISD::FADD_VL:
case RISCVISD::FSUB_VL:
case RISCVISD::FMUL_VL:
@@ -17176,7 +17189,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (N->getValueType(0).getVectorElementType() == MVT::f32 &&
!Subtarget.hasVInstructionsF16())
return SDValue();
- return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget);
+ return combineOp_VLToVWOp_VL(N, DCI, Subtarget);
}
case ISD::LOAD:
case ISD::STORE: {
@@ -20242,6 +20255,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FP_EXTEND_BF16)
NODE_NAME_CASE(FROUND)
NODE_NAME_CASE(FCLASS)
+ NODE_NAME_CASE(FSGNJX)
NODE_NAME_CASE(FMAX)
NODE_NAME_CASE(FMIN)
NODE_NAME_CASE(READ_COUNTER_WIDE)
@@ -21072,7 +21086,7 @@ bool RISCVTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
- if (Subtarget.hasVendorXCVmem()) {
+ if (Subtarget.hasVendorXCVmem() && !Subtarget.is64Bit()) {
if (Op->getOpcode() != ISD::ADD)
return false;
@@ -21171,37 +21185,37 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned)
bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// Check integral scalar types.
- const bool HasZmmul = Subtarget.hasStdExtZmmul();
if (!VT.isScalarInteger())
return false;
// Omit the optimization if the sub target has the M extension and the data
// size exceeds XLen.
+ const bool HasZmmul = Subtarget.hasStdExtZmmul();
if (HasZmmul && VT.getSizeInBits() > Subtarget.getXLen())
return false;
- if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
- // Break the MUL to a SLLI and an ADD/SUB.
- const APInt &Imm = ConstNode->getAPIntValue();
- if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() ||
- (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2())
- return true;
+ auto *ConstNode = cast<ConstantSDNode>(C);
+ const APInt &Imm = ConstNode->getAPIntValue();
- // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12.
- if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) &&
- ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() ||
- (Imm - 8).isPowerOf2()))
- return true;
+ // Break the MUL to a SLLI and an ADD/SUB.
+ if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2() ||
+ (1 - Imm).isPowerOf2() || (-1 - Imm).isPowerOf2())
+ return true;
- // Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs
- // a pair of LUI/ADDI.
- if (!Imm.isSignedIntN(12) && Imm.countr_zero() < 12 &&
- ConstNode->hasOneUse()) {
- APInt ImmS = Imm.ashr(Imm.countr_zero());
- if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() ||
- (1 - ImmS).isPowerOf2())
- return true;
- }
+ // Optimize the MUL to (SH*ADD x, (SLLI x, bits)) if Imm is not simm12.
+ if (Subtarget.hasStdExtZba() && !Imm.isSignedIntN(12) &&
+ ((Imm - 2).isPowerOf2() || (Imm - 4).isPowerOf2() ||
+ (Imm - 8).isPowerOf2()))
+ return true;
+
+ // Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs
+ // a pair of LUI/ADDI.
+ if (!Imm.isSignedIntN(12) && Imm.countr_zero() < 12 &&
+ ConstNode->hasOneUse()) {
+ APInt ImmS = Imm.ashr(Imm.countr_zero());
+ if ((ImmS + 1).isPowerOf2() || (ImmS - 1).isPowerOf2() ||
+ (1 - ImmS).isPowerOf2())
+ return true;
}
return false;
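Editor's note: the reshuffled decomposeMulByConstant replaces the dyn_cast guard with a plain cast and keeps the same three profitability checks. As a worked illustration of the first check only (one shift plus one add/sub), a toy predicate might look like the following; the Imm-2/4/8 cases behind Zba's shXadd and the two-SLLI case are deliberately omitted:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Accept when x*Imm can become one shift plus one add/sub, e.g.
//   x * 9  = (x << 3) + x    because Imm - 1 is a power of two
//   x * 7  = (x << 3) - x    because Imm + 1 is a power of two
//   x * -7 = x - (x << 3)    because 1 - Imm is a power of two
static bool isPow2(int64_t V) { return V > 0 && (V & (V - 1)) == 0; }

static bool shiftAddProfitable(int64_t Imm) {
  return isPow2(Imm + 1) || isPow2(Imm - 1) || isPow2(1 - Imm) || isPow2(-1 - Imm);
}

int main() {
  for (int64_t Imm : {7, 9, -7, 12, 200})
    std::printf("x * %lld -> %s\n", (long long)Imm,
                shiftAddProfitable(Imm) ? "shift + add/sub" : "keep mul (or later checks)");
}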
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index e469a4b..d1d0760 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -128,6 +128,7 @@ enum NodeType : unsigned {
FROUND,
FCLASS,
+ FSGNJX,
// Floating point fmax and fmin matching the RISC-V instruction semantics.
FMAX, FMIN,
@@ -237,7 +238,7 @@ enum NodeType : unsigned {
VECREDUCE_FMIN_VL,
VECREDUCE_FMAX_VL,
- // Vector binary ops with a merge as a third operand, a mask as a fourth
+ // Vector binary ops with a passthru as a third operand, a mask as a fourth
// operand, and VL as a fifth operand.
ADD_VL,
AND_VL,
@@ -293,7 +294,7 @@ enum NodeType : unsigned {
FABS_VL,
FSQRT_VL,
FCLASS_VL,
- FCOPYSIGN_VL, // Has a merge operand
+ FCOPYSIGN_VL, // Has a passthru operand
VFCVT_RTZ_X_F_VL,
VFCVT_RTZ_XU_F_VL,
VFCVT_X_F_VL,
@@ -321,7 +322,7 @@ enum NodeType : unsigned {
VFWMSUB_VL,
VFWNMSUB_VL,
- // Widening instructions with a merge value a third operand, a mask as a
+  // Widening instructions with a passthru value as a third operand, a mask as a
// fourth operand, and VL as a fifth operand.
VWMUL_VL,
VWMULU_VL,
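Editor's note: the renamed comments above describe the common *_VL operand layout after this change: the result-typed passthru is the third operand, the mask the fourth, and VL the fifth. As a standalone illustration of what the passthru operand means for a masked vector op (plain C++, not LLVM code), inactive lanes keep the passthru value rather than the computed one:

#include <cstdio>
#include <vector>

// Lanes where the mask is off keep the passthru value; active lanes within VL
// get the computed result.
static std::vector<int> maskedAddVL(const std::vector<int> &Passthru,
                                    const std::vector<int> &A,
                                    const std::vector<int> &B,
                                    const std::vector<bool> &Mask, size_t VL) {
  std::vector<int> Res = Passthru; // start from the passthru contents
  for (size_t I = 0; I < VL; ++I)
    if (Mask[I])
      Res[I] = A[I] + B[I];
  return Res;
}

int main() {
  std::vector<int> P{9, 9, 9, 9}, A{1, 2, 3, 4}, B{10, 20, 30, 40};
  std::vector<bool> M{true, false, true, false};
  for (int V : maskedAddVL(P, A, B, M, 4))
    std::printf("%d ", V); // prints: 11 9 33 9
  std::printf("\n");
}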
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 96250b9..af2279f 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -182,7 +182,7 @@ static bool isMaskRegOp(const MachineInstr &MI) {
/// Note that this is different from "agnostic" as defined by the vector
/// specification. Agnostic requires each lane to either be undisturbed, or
/// take the value -1; no other value is allowed.
-static bool hasUndefinedMergeOp(const MachineInstr &MI) {
+static bool hasUndefinedPassthru(const MachineInstr &MI) {
unsigned UseOpIdx;
if (!MI.isRegTiedToUseOperand(0, &UseOpIdx))
@@ -443,13 +443,13 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
Res.LMUL = DemandedFields::LMULNone;
Res.SEWLMULRatio = false;
Res.VLAny = false;
- // For vmv.s.x and vfmv.s.f, if the merge operand is *undefined*, we don't
+ // For vmv.s.x and vfmv.s.f, if the passthru is *undefined*, we don't
    // need to preserve any other bits and are thus compatible with any larger
// etype and can disregard policy bits. Warning: It's tempting to try doing
// this for any tail agnostic operation, but we can't as TA requires
// tail lanes to either be the original value or -1. We are writing
// unknown bits to the lanes here.
- if (hasUndefinedMergeOp(MI)) {
+ if (hasUndefinedPassthru(MI)) {
if (isFloatScalarMoveOrScalarSplatInstr(MI) && !ST->hasVInstructionsF64())
Res.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
else
@@ -458,7 +458,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
}
}
- // vmv.x.s, and vmv.f.s are unconditional and ignore everything except SEW.
+ // vmv.x.s, and vfmv.f.s are unconditional and ignore everything except SEW.
if (isScalarExtractInstr(MI)) {
assert(!RISCVII::hasVLOp(TSFlags));
Res.LMUL = DemandedFields::LMULNone;
@@ -469,7 +469,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
if (RISCVII::hasVLOp(MI.getDesc().TSFlags)) {
const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
- // A slidedown/slideup with an *undefined* merge op can freely clobber
+ // A slidedown/slideup with an *undefined* passthru can freely clobber
// elements not copied from the source vector (e.g. masked off, tail, or
// slideup's prefix). Notes:
// * We can't modify SEW here since the slide amount is in units of SEW.
@@ -478,7 +478,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
// * The LMUL1 restriction is for machines whose latency may depend on VL.
// * As above, this is only legal for tail "undefined" not "agnostic".
if (isVSlideInstr(MI) && VLOp.isImm() && VLOp.getImm() == 1 &&
- hasUndefinedMergeOp(MI)) {
+ hasUndefinedPassthru(MI)) {
Res.VLAny = false;
Res.VLZeroness = true;
Res.LMUL = DemandedFields::LMULLessThanOrEqualToM1;
@@ -492,7 +492,7 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
// careful to not increase the number of active vector registers (unlike for
// vmv.s.x.)
if (isScalarSplatInstr(MI) && VLOp.isImm() && VLOp.getImm() == 1 &&
- hasUndefinedMergeOp(MI)) {
+ hasUndefinedPassthru(MI)) {
Res.LMUL = DemandedFields::LMULLessThanOrEqualToM1;
Res.SEWLMULRatio = false;
Res.VLAny = false;
@@ -1000,7 +1000,7 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
bool TailAgnostic = true;
bool MaskAgnostic = true;
- if (!hasUndefinedMergeOp(MI)) {
+ if (!hasUndefinedPassthru(MI)) {
// Start with undisturbed.
TailAgnostic = false;
MaskAgnostic = false;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 0620c3f..9dd7902 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3763,6 +3763,12 @@ RISCVInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
return ArrayRef(TargetFlags);
}
+unsigned RISCVInstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
+ return OptLevel >= CodeGenOptLevel::Aggressive
+ ? STI.getTailDupAggressiveThreshold()
+ : 2;
+}
+
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
bool RISCV::isSEXT_W(const MachineInstr &MI) {
return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&
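Editor's note: the new getTailDuplicateSize override keeps the generic limit of 2 instructions below -O3 and switches to a subtarget-tunable threshold at CodeGenOptLevel::Aggressive. A toy mirror of that policy (the threshold value 6 below is made up for illustration and is not the subtarget default):

#include <cstdio>

enum class CodeGenOptLevel { None, Less, Default, Aggressive };

static unsigned tailDuplicateSize(CodeGenOptLevel OptLevel, unsigned AggressiveThreshold) {
  // Conservative size of 2 unless we are compiling aggressively.
  return OptLevel >= CodeGenOptLevel::Aggressive ? AggressiveThreshold : 2;
}

int main() {
  std::printf("-O2: %u instructions, -O3: %u instructions\n",
              tailDuplicateSize(CodeGenOptLevel::Default, 6),
              tailDuplicateSize(CodeGenOptLevel::Aggressive, 6));
}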
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 025e12d..1612f56 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -288,6 +288,8 @@ public:
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
getSerializableMachineMemOperandTargetFlags() const override;
+ unsigned getTailDuplicateSize(CodeGenOptLevel OptLevel) const override;
+
unsigned getUndefInitOpcode(unsigned RegClassID) const override {
switch (RegClassID) {
case RISCV::VRRegClassID:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 04054d2..cebe95c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -593,8 +593,7 @@ class ALUW_rr<bits<7> funct7, bits<3> funct3, string opcodestr,
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
class Priv<string opcodestr, bits<7> funct7>
- : RVInstR<funct7, 0b000, OPC_SYSTEM, (outs), (ins GPR:$rs1, GPR:$rs2),
- opcodestr, "">;
+ : RVInstR<funct7, 0b000, OPC_SYSTEM, (outs), (ins), opcodestr, "">;
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
class Priv_rr<string opcodestr, bits<7> funct7>
@@ -1543,8 +1542,8 @@ def PseudoCALL : Pseudo<(outs), (ins call_symbol:$func), [],
def : Pat<(riscv_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
def : Pat<(riscv_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
-def : Pat<(riscv_sret_glue), (SRET (XLenVT X0), (XLenVT X0))>;
-def : Pat<(riscv_mret_glue), (MRET (XLenVT X0), (XLenVT X0))>;
+def : Pat<(riscv_sret_glue), (SRET)>;
+def : Pat<(riscv_mret_glue), (MRET)>;
let isCall = 1, Defs = [X1] in {
let Predicates = [NoStdExtZicfilp] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index 8efefee..35ab277 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -282,6 +282,7 @@ def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>;
def : Pat<(riscv_fclass FPR64:$rs1), (FCLASS_D $rs1)>;
def : PatFprFpr<fcopysign, FSGNJ_D, FPR64, f64>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_D, FPR64, f64>;
def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>;
def : Pat<(fcopysign FPR64:$rs1, FPR32:$rs2), (FSGNJ_D $rs1, (FCVT_D_S $rs2,
FRM_RNE))>;
@@ -318,6 +319,7 @@ def : Pat<(fabs FPR64INX:$rs1), (FSGNJX_D_INX $rs1, $rs1)>;
def : Pat<(riscv_fclass FPR64INX:$rs1), (FCLASS_D_INX $rs1)>;
def : PatFprFpr<fcopysign, FSGNJ_D_INX, FPR64INX, f64>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_D_INX, FPR64INX, f64>;
def : Pat<(fcopysign FPR64INX:$rs1, (fneg FPR64INX:$rs2)),
(FSGNJN_D_INX $rs1, $rs2)>;
def : Pat<(fcopysign FPR64INX:$rs1, FPR32INX:$rs2),
@@ -355,6 +357,7 @@ def : Pat<(fabs FPR64IN32X:$rs1), (FSGNJX_D_IN32X $rs1, $rs1)>;
def : Pat<(riscv_fclass FPR64IN32X:$rs1), (FCLASS_D_IN32X $rs1)>;
def : PatFprFpr<fcopysign, FSGNJ_D_IN32X, FPR64IN32X, f64>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_D_IN32X, FPR64IN32X, f64>;
def : Pat<(fcopysign FPR64IN32X:$rs1, (fneg FPR64IN32X:$rs2)),
(FSGNJN_D_IN32X $rs1, $rs2)>;
def : Pat<(fcopysign FPR64IN32X:$rs1, FPR32INX:$rs2),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 7d89608..e6c25e0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -31,6 +31,8 @@ def SDT_RISCVFROUND
SDTCisVT<3, XLenVT>]>;
def SDT_RISCVFCLASS
: SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisFP<1>]>;
+def SDT_RISCVFSGNJX
+ : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>;
def riscv_fclass
: SDNode<"RISCVISD::FCLASS", SDT_RISCVFCLASS>;
@@ -38,6 +40,9 @@ def riscv_fclass
def riscv_fround
: SDNode<"RISCVISD::FROUND", SDT_RISCVFROUND>;
+def riscv_fsgnjx
+ : SDNode<"RISCVISD::FSGNJX", SDT_RISCVFSGNJX>;
+
def riscv_fmv_w_x_rv64
: SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>;
def riscv_fmv_x_anyextw_rv64
@@ -539,8 +544,10 @@ def : Pat<(fabs FPR32INX:$rs1), (FSGNJX_S_INX $rs1, $rs1)>;
def : Pat<(riscv_fclass FPR32INX:$rs1), (FCLASS_S_INX $rs1)>;
} // Predicates = [HasStdExtZfinx]
-foreach Ext = FExts in
+foreach Ext = FExts in {
defm : PatFprFpr_m<fcopysign, FSGNJ_S, Ext>;
+defm : PatFprFpr_m<riscv_fsgnjx, FSGNJX_S, Ext>;
+}
let Predicates = [HasStdExtF] in {
def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index b581723..5580504 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -92,34 +92,34 @@ def simm5_plus1_nonzero : ImmLeaf<XLenVT,
//===----------------------------------------------------------------------===//
// Common class of scheduling definitions.
-// `ReadVMergeOp` will be prepended to reads if instruction is masked.
+// `ReadVPassthru` will be prepended to reads if instruction is masked.
// `ReadVMask` will be appended to reads if instruction is masked.
// Operands:
// `writes` SchedWrites that are listed for each explicit def operand
// in order.
// `reads` SchedReads that are listed for each explicit use operand.
// `forceMasked` Forced to be masked (e.g. Add-with-Carry Instructions).
-// `forceMergeOpRead` Force to have read for merge operand.
+// `forcePassthruRead` Force a read for the passthru operand.
class SchedCommon<list<SchedWrite> writes, list<SchedRead> reads,
string mx = "WorstCase", int sew = 0, bit forceMasked = 0,
- bit forceMergeOpRead = 0> : Sched<[]> {
+ bit forcePassthruRead = 0> : Sched<[]> {
defvar isMasked = !ne(!find(NAME, "_MASK"), -1);
defvar isMaskedOrForceMasked = !or(forceMasked, isMasked);
- defvar mergeRead = !if(!or(!eq(mx, "WorstCase"), !eq(sew, 0)),
- !cast<SchedRead>("ReadVMergeOp_" # mx),
- !cast<SchedRead>("ReadVMergeOp_" # mx # "_E" #sew));
- defvar needsMergeRead = !or(isMaskedOrForceMasked, forceMergeOpRead);
+ defvar passthruRead = !if(!or(!eq(mx, "WorstCase"), !eq(sew, 0)),
+ !cast<SchedRead>("ReadVPassthru_" # mx),
+ !cast<SchedRead>("ReadVPassthru_" # mx # "_E" #sew));
+ defvar needsPassthruRead = !or(isMaskedOrForceMasked, forcePassthruRead);
defvar readsWithMask =
!if(isMaskedOrForceMasked, !listconcat(reads, [ReadVMask]), reads);
defvar allReads =
- !if(needsMergeRead, !listconcat([mergeRead], readsWithMask), reads);
+ !if(needsPassthruRead, !listconcat([passthruRead], readsWithMask), reads);
let SchedRW = !listconcat(writes, allReads);
}
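Editor's note: for readers not fluent in the TableGen above, SchedCommon builds the SchedRead list by prepending a ReadVPassthru_* read when the pseudo is masked or forcePassthruRead is set, and appending ReadVMask when it is masked. A standalone C++ sketch of that list construction (the read names are only examples):

#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> buildReads(std::vector<std::string> Reads,
                                           bool Masked, bool ForcePassthruRead) {
  if (Masked)
    Reads.push_back("ReadVMask");                       // readsWithMask
  if (Masked || ForcePassthruRead)
    Reads.insert(Reads.begin(), "ReadVPassthru_M1");    // allReads
  return Reads;
}

int main() {
  for (const std::string &R : buildReads({"ReadVIALUV", "ReadVIALUX"}, true, false))
    std::printf("%s ", R.c_str());
  std::printf("\n"); // ReadVPassthru_M1 ReadVIALUV ReadVIALUX ReadVMask
}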
// Common class of scheduling definitions for n-ary instructions.
// The scheduling resources are relevant to LMUL and may be relevant to SEW.
class SchedNary<string write, list<string> reads, string mx, int sew = 0,
- bit forceMasked = 0, bit forceMergeOpRead = 0>
+ bit forceMasked = 0, bit forcePassthruRead = 0>
: SchedCommon<[!cast<SchedWrite>(
!if(sew,
write # "_" # mx # "_E" # sew,
@@ -127,7 +127,7 @@ class SchedNary<string write, list<string> reads, string mx, int sew = 0,
!foreach(read, reads,
!cast<SchedRead>(!if(sew, read #"_" #mx #"_E" #sew,
read #"_" #mx))),
- mx, sew, forceMasked, forceMergeOpRead>;
+ mx, sew, forceMasked, forcePassthruRead>;
// Classes with postfix "MC" are only used in MC layer.
// For these classes, we assume that they are with the worst case costs and
@@ -135,22 +135,22 @@ class SchedNary<string write, list<string> reads, string mx, int sew = 0,
// For instructions with no operand.
class SchedNullary<string write, string mx, int sew = 0, bit forceMasked = 0,
- bit forceMergeOpRead = 0>:
- SchedNary<write, [], mx, sew, forceMasked, forceMergeOpRead>;
+ bit forcePassthruRead = 0>:
+ SchedNary<write, [], mx, sew, forceMasked, forcePassthruRead>;
class SchedNullaryMC<string write, bit forceMasked = 1>:
SchedNullary<write, "WorstCase", forceMasked=forceMasked>;
// For instructions with one operand.
class SchedUnary<string write, string read0, string mx, int sew = 0,
- bit forceMasked = 0, bit forceMergeOpRead = 0>:
- SchedNary<write, [read0], mx, sew, forceMasked, forceMergeOpRead>;
+ bit forceMasked = 0, bit forcePassthruRead = 0>:
+ SchedNary<write, [read0], mx, sew, forceMasked, forcePassthruRead>;
class SchedUnaryMC<string write, string read0, bit forceMasked = 1>:
SchedUnary<write, read0, "WorstCase", forceMasked=forceMasked>;
// For instructions with two operands.
class SchedBinary<string write, string read0, string read1, string mx,
- int sew = 0, bit forceMasked = 0, bit forceMergeOpRead = 0>
- : SchedNary<write, [read0, read1], mx, sew, forceMasked, forceMergeOpRead>;
+ int sew = 0, bit forceMasked = 0, bit forcePassthruRead = 0>
+ : SchedNary<write, [read0, read1], mx, sew, forceMasked, forcePassthruRead>;
class SchedBinaryMC<string write, string read0, string read1,
bit forceMasked = 1>:
SchedBinary<write, read0, read1, "WorstCase", forceMasked=forceMasked>;
@@ -165,9 +165,9 @@ class SchedTernaryMC<string write, string read0, string read1, string read2,
// For reduction instructions.
class SchedReduction<string write, string read, string mx, int sew,
- bit forceMergeOpRead = 0>
+ bit forcePassthruRead = 0>
: SchedCommon<[!cast<SchedWrite>(write #"_" #mx #"_E" #sew)],
- !listsplat(!cast<SchedRead>(read), 3), mx, sew, forceMergeOpRead>;
+ !listsplat(!cast<SchedRead>(read), 3), mx, sew, forcePassthruRead>;
class SchedReductionMC<string write, string readV, string readV0>:
SchedCommon<[!cast<SchedWrite>(write # "_WorstCase")],
[!cast<SchedRead>(readV), !cast<SchedRead>(readV0)],
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index b860273..e23179e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -751,7 +751,7 @@ class VPseudo<Instruction instr, LMULInfo m, dag outs, dag ins, int sew = 0> :
class GetVTypePredicates<VTypeInfo vti> {
list<Predicate> Predicates = !cond(!eq(vti.Scalar, f16) : [HasVInstructionsF16],
- !eq(vti.Scalar, bf16) : [HasVInstructionsBF16],
+ !eq(vti.Scalar, bf16) : [HasVInstructionsBF16Minimal],
!eq(vti.Scalar, f32) : [HasVInstructionsAnyF],
!eq(vti.Scalar, f64) : [HasVInstructionsF64],
!eq(vti.SEW, 64) : [HasVInstructionsI64],
@@ -777,7 +777,7 @@ class VPseudoUSLoadNoMask<VReg RetClass,
class VPseudoUSLoadMask<VReg RetClass,
int EEW> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMem:$rs1,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
@@ -785,7 +785,7 @@ class VPseudoUSLoadMask<VReg RetClass,
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -811,7 +811,7 @@ class VPseudoUSLoadFFNoMask<VReg RetClass,
class VPseudoUSLoadFFMask<VReg RetClass,
int EEW> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMem:$rs1,
VMaskOp:$vm, AVL:$avl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
@@ -819,7 +819,7 @@ class VPseudoUSLoadFFMask<VReg RetClass,
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -845,7 +845,7 @@ class VPseudoSLoadNoMask<VReg RetClass,
class VPseudoSLoadMask<VReg RetClass,
int EEW> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMem:$rs1, GPR:$rs2,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
@@ -853,7 +853,7 @@ class VPseudoSLoadMask<VReg RetClass,
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -890,7 +890,7 @@ class VPseudoILoadMask<VReg RetClass,
bit EarlyClobber,
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
GPRMem:$rs1, IdxClass:$rs2,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
@@ -898,7 +898,7 @@ class VPseudoILoadMask<VReg RetClass,
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $merge", "$rd = $merge");
+ let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $passthru", "$rd = $passthru");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -963,13 +963,13 @@ class VPseudoSStoreMask<VReg StClass,
class VPseudoNullaryNoMask<VReg RegClass> :
Pseudo<(outs RegClass:$rd),
- (ins RegClass:$merge,
+ (ins RegClass:$passthru,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -977,13 +977,13 @@ class VPseudoNullaryNoMask<VReg RegClass> :
class VPseudoNullaryMask<VReg RegClass> :
Pseudo<(outs GetVRegNoV0<RegClass>.R:$rd),
- (ins GetVRegNoV0<RegClass>.R:$merge,
+ (ins GetVRegNoV0<RegClass>.R:$passthru,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints ="$rd = $merge";
+ let Constraints ="$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let UsesMaskPolicy = 1;
@@ -1012,13 +1012,13 @@ class VPseudoUnaryNoMask<DAGOperand RetClass,
string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, OpClass:$rs2,
+ (ins RetClass:$passthru, OpClass:$rs2,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1046,13 +1046,13 @@ class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass,
string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, OpClass:$rs2, ixlenimm:$rm,
+ (ins RetClass:$passthru, OpClass:$rs2, ixlenimm:$rm,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1066,13 +1066,13 @@ class VPseudoUnaryMask<VReg RetClass,
string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1085,14 +1085,14 @@ class VPseudoUnaryMaskRoundingMode<VReg RetClass,
string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
VMaskOp:$vm, ixlenimm:$rm,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1106,12 +1106,12 @@ class VPseudoUnaryMask_NoExcept<VReg RetClass,
VReg OpClass,
string Constraint = ""> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []> {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1124,13 +1124,13 @@ class VPseudoUnaryNoMask_FRM<VReg RetClass,
string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, OpClass:$rs2, ixlenimm:$frm,
+ (ins RetClass:$passthru, OpClass:$rs2, ixlenimm:$frm,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1143,14 +1143,14 @@ class VPseudoUnaryMask_FRM<VReg RetClass,
string Constraint = "",
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge, OpClass:$rs2,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, OpClass:$rs2,
VMaskOp:$vm, ixlenimm:$frm,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1185,13 +1185,13 @@ class VPseudoUnaryMaskGPROut :
class VPseudoUnaryAnyMask<VReg RetClass,
VReg Op1Class> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, Op1Class:$rs2,
+ (ins RetClass:$passthru, Op1Class:$rs2,
VR:$vm, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "@earlyclobber $rd, $rd = $merge";
+ let Constraints = "@earlyclobber $rd, $rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
}
@@ -1219,13 +1219,13 @@ class VPseudoBinaryNoMaskPolicy<VReg RetClass,
string Constraint,
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl,
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl,
ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1239,12 +1239,12 @@ class VPseudoBinaryNoMaskRoundingMode<VReg RetClass,
int UsesVXRM_ = 1,
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, ixlenimm:$rm,
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1, ixlenimm:$rm,
AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1260,14 +1260,14 @@ class VPseudoBinaryMaskPolicyRoundingMode<VReg RetClass,
int UsesVXRM_,
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm, ixlenimm:$rm, AVL:$vl,
ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1358,14 +1358,14 @@ class VPseudoBinaryMaskPolicy<VReg RetClass,
string Constraint,
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1377,14 +1377,14 @@ class VPseudoTernaryMaskPolicy<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1394,7 +1394,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm,
ixlenimm:$rm,
@@ -1403,7 +1403,7 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass,
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1418,14 +1418,14 @@ class VPseudoBinaryMOutMask<VReg RetClass,
string Constraint,
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge,
+ (ins RetClass:$passthru,
Op1Class:$rs2, Op2Class:$rs1,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1440,14 +1440,14 @@ class VPseudoTiedBinaryMask<VReg RetClass,
string Constraint,
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
Op2Class:$rs1,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1461,7 +1461,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
string Constraint,
int TargetConstraintType = 1> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge,
+ (ins GetVRegNoV0<RetClass>.R:$passthru,
Op2Class:$rs1,
VMaskOp:$vm,
ixlenimm:$rm,
@@ -1470,7 +1470,7 @@ class VPseudoTiedBinaryMaskRoundingMode<VReg RetClass,
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = !interleave([Constraint, "$rd = $merge"], ",");
+ let Constraints = !interleave([Constraint, "$rd = $passthru"], ",");
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1511,13 +1511,13 @@ class VPseudoTiedBinaryCarryIn<VReg RetClass,
LMULInfo MInfo,
int TargetConstraintType = 1> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1,
+ (ins RetClass:$passthru, Op1Class:$rs2, Op2Class:$rs1,
VMV0:$carry, AVL:$vl, ixlenimm:$sew), []>,
RISCVVPseudo {
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let TargetOverlapConstraintType = TargetConstraintType;
let HasVLOp = 1;
let HasSEWOp = 1;
@@ -1602,14 +1602,14 @@ class VPseudoUSSegLoadMask<VReg RetClass,
int EEW,
bits<4> NF> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/0, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1637,14 +1637,14 @@ class VPseudoUSSegLoadFFMask<VReg RetClass,
int EEW,
bits<4> NF> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd, GPR:$vl),
- (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
VMaskOp:$vm, AVL:$avl, ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
RISCVVLSEG<NF, /*Masked*/1, /*Strided*/0, /*FF*/1, !logtwo(EEW), VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1655,7 +1655,7 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
int EEW,
bits<4> NF> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, GPRMem:$rs1, GPR:$offset, AVL:$vl,
+ (ins RetClass:$passthru, GPRMem:$rs1, GPR:$offset, AVL:$vl,
ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
RISCVVLSEG<NF, /*Masked*/0, /*Strided*/1, /*FF*/0, !logtwo(EEW), VLMul> {
@@ -1665,14 +1665,14 @@ class VPseudoSSegLoadNoMask<VReg RetClass,
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
}
class VPseudoSSegLoadMask<VReg RetClass,
int EEW,
bits<4> NF> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
GPR:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew,
ixlenimm:$policy), []>,
RISCVVPseudo,
@@ -1680,7 +1680,7 @@ class VPseudoSSegLoadMask<VReg RetClass,
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
- let Constraints = "$rd = $merge";
+ let Constraints = "$rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1694,7 +1694,7 @@ class VPseudoISegLoadNoMask<VReg RetClass,
bits<4> NF,
bit Ordered> :
Pseudo<(outs RetClass:$rd),
- (ins RetClass:$merge, GPRMem:$rs1, IdxClass:$offset, AVL:$vl,
+ (ins RetClass:$passthru, GPRMem:$rs1, IdxClass:$offset, AVL:$vl,
ixlenimm:$sew, ixlenimm:$policy), []>,
RISCVVPseudo,
RISCVVLXSEG<NF, /*Masked*/0, Ordered, !logtwo(EEW), VLMul, LMUL> {
@@ -1703,7 +1703,7 @@ class VPseudoISegLoadNoMask<VReg RetClass,
let hasSideEffects = 0;
// For vector indexed segment loads, the destination vector register groups
// cannot overlap the source vector register group
- let Constraints = "@earlyclobber $rd, $rd = $merge";
+ let Constraints = "@earlyclobber $rd, $rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -1716,7 +1716,7 @@ class VPseudoISegLoadMask<VReg RetClass,
bits<4> NF,
bit Ordered> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
- (ins GetVRegNoV0<RetClass>.R:$merge, GPRMem:$rs1,
+ (ins GetVRegNoV0<RetClass>.R:$passthru, GPRMem:$rs1,
IdxClass:$offset, VMaskOp:$vm, AVL:$vl, ixlenimm:$sew,
ixlenimm:$policy), []>,
RISCVVPseudo,
@@ -1726,7 +1726,7 @@ class VPseudoISegLoadMask<VReg RetClass,
let hasSideEffects = 0;
// For vector indexed segment loads, the destination vector register groups
// cannot overlap the source vector register group
- let Constraints = "@earlyclobber $rd, $rd = $merge";
+ let Constraints = "@earlyclobber $rd, $rd = $passthru";
let HasVLOp = 1;
let HasSEWOp = 1;
let HasVecPolicyOp = 1;
@@ -2024,11 +2024,11 @@ multiclass VPseudoVSFS_M {
let VLMul = mti.LMul.value in {
def "_M_" # mti.BX : VPseudoUnaryNoMaskNoPolicy<VR, VR, constraint>,
SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
let ForceTailAgnostic = true in
def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>,
SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2038,11 +2038,11 @@ multiclass VPseudoVID_V {
defvar mx = m.MX;
let VLMul = m.value in {
def "_V_" # mx : VPseudoNullaryNoMask<m.vrclass>,
- SchedNullary<"WriteVIdxV", mx, forceMergeOpRead=true>;
+ SchedNullary<"WriteVIdxV", mx, forcePassthruRead=true>;
def "_V_" # mx # "_MASK" : VPseudoNullaryMask<m.vrclass>,
RISCVMaskedPseudo<MaskIdx=1>,
SchedNullary<"WriteVIdxV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2063,11 +2063,11 @@ multiclass VPseudoVIOTA_M {
let VLMul = m.value in {
def "_" # mx : VPseudoUnaryNoMask<m.vrclass, VR, constraint>,
SchedUnary<"WriteVIotaV", "ReadVIotaV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
def "_" # mx # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>,
RISCVMaskedPseudo<MaskIdx=2, ActiveAffectsRes=true>,
SchedUnary<"WriteVIotaV", "ReadVIotaV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2227,7 +2227,7 @@ multiclass VPseudoVGTR_EI16_VV {
: VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul,
constraint, e>,
SchedBinary<"WriteVRGatherEI16VV", "ReadVRGatherEI16VV_data",
- "ReadVRGatherEI16VV_index", mx, e, forceMergeOpRead=true>;
+ "ReadVRGatherEI16VV_index", mx, e, forcePassthruRead=true>;
}
}
}
@@ -2246,7 +2246,7 @@ multiclass VPseudoVSLD1_VX<string Constraint = ""> {
foreach m = MxList in {
defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>,
SchedBinary<"WriteVISlide1X", "ReadVISlideV", "ReadVISlideX",
- m.MX, forceMergeOpRead=true>;
+ m.MX, forcePassthruRead=true>;
}
}
@@ -2267,7 +2267,7 @@ multiclass VPseudoVSLD1_VF<string Constraint = ""> {
defm "_V" #f.FX
: VPseudoBinary<m.vrclass, m.vrclass, f.fprclass, m, Constraint>,
SchedBinary<"WriteVFSlide1F", "ReadVFSlideV", "ReadVFSlideF", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2445,7 +2445,7 @@ multiclass VPseudoVMRG_FM {
: VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, m.vrclass,
f.fprclass, m>,
SchedBinary<"WriteVFMergeV", "ReadVFMergeV", "ReadVFMergeF", mx,
- forceMasked=1, forceMergeOpRead=true>;
+ forceMasked=1, forcePassthruRead=true>;
}
}
}
@@ -2472,13 +2472,13 @@ multiclass VPseudoUnaryVMV_V_X_I {
let VLMul = m.value in {
def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
SchedUnary<"WriteVIMovV", "ReadVIMovV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
def "_X_" # mx : VPseudoUnaryNoMask<m.vrclass, GPR>,
SchedUnary<"WriteVIMovX", "ReadVIMovX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
def "_I_" # mx : VPseudoUnaryNoMask<m.vrclass, simm5>,
SchedNullary<"WriteVIMovI", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2491,7 +2491,7 @@ multiclass VPseudoVMV_F {
let VLMul = m.value in {
def "_" # f.FX # "_" # mx :
VPseudoUnaryNoMask<m.vrclass, f.fprclass>,
- SchedUnary<"WriteVFMovV", "ReadVFMovF", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVFMovV", "ReadVFMovF", mx, forcePassthruRead=true>;
}
}
}
@@ -2503,11 +2503,11 @@ multiclass VPseudoVCLS_V {
let VLMul = m.value in {
def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
SchedUnary<"WriteVFClassV", "ReadVFClassV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
def "_V_" # mx # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>,
RISCVMaskedPseudo<MaskIdx=2>,
SchedUnary<"WriteVFClassV", "ReadVFClassV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2523,12 +2523,12 @@ multiclass VPseudoVSQR_V_RM {
let SEW = e in {
def "_V" # suffix : VPseudoUnaryNoMaskRoundingMode<m.vrclass, m.vrclass>,
SchedUnary<"WriteVFSqrtV", "ReadVFSqrtV", mx, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
def "_V" #suffix # "_MASK"
: VPseudoUnaryMaskRoundingMode<m.vrclass, m.vrclass>,
RISCVMaskedPseudo<MaskIdx = 2>,
SchedUnary<"WriteVFSqrtV", "ReadVFSqrtV", mx, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2541,11 +2541,11 @@ multiclass VPseudoVRCP_V {
let VLMul = m.value in {
def "_V_" # mx # "_E" # e
: VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
- SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
def "_V_" # mx # "_E" # e # "_MASK"
: VPseudoUnaryMask<m.vrclass, m.vrclass>,
RISCVMaskedPseudo<MaskIdx = 2>,
- SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
}
}
}
@@ -2558,11 +2558,11 @@ multiclass VPseudoVRCP_V_RM {
let VLMul = m.value in {
def "_V_" # mx # "_E" # e
: VPseudoUnaryNoMaskRoundingMode<m.vrclass, m.vrclass>,
- SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
def "_V_" # mx # "_E" # e # "_MASK"
: VPseudoUnaryMaskRoundingMode<m.vrclass, m.vrclass>,
RISCVMaskedPseudo<MaskIdx = 2>,
- SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forceMergeOpRead=true>;
+ SchedUnary<"WriteVFRecpV", "ReadVFRecpV", mx, e, forcePassthruRead=true>;
}
}
}
@@ -2575,11 +2575,11 @@ multiclass PseudoVEXT_VF2 {
defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF4"), !eq(mx, "MF2"), !eq(mx, "M1")), 1, 3);
let VLMul = m.value in {
def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints, CurrTypeConstraints>,
- SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
def "_" # mx # "_MASK" :
VPseudoUnaryMask<m.vrclass, m.f2vrclass, constraints, CurrTypeConstraints>,
RISCVMaskedPseudo<MaskIdx=2>,
- SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
}
}
}
@@ -2591,11 +2591,11 @@ multiclass PseudoVEXT_VF4 {
defvar CurrTypeConstraints = !if(!or(!eq(mx, "MF2"), !eq(mx, "M1"), !eq(mx, "M2")), 1, 3);
let VLMul = m.value in {
def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints, CurrTypeConstraints>,
- SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
def "_" # mx # "_MASK" :
VPseudoUnaryMask<m.vrclass, m.f4vrclass, constraints, CurrTypeConstraints>,
RISCVMaskedPseudo<MaskIdx=2>,
- SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
}
}
}
@@ -2607,11 +2607,11 @@ multiclass PseudoVEXT_VF8 {
defvar CurrTypeConstraints = !if(!or(!eq(mx, "M1"), !eq(mx, "M2"), !eq(mx, "M4")), 1, 3);
let VLMul = m.value in {
def "_" # mx : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints, CurrTypeConstraints>,
- SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
def "_" # mx # "_MASK" :
VPseudoUnaryMask<m.vrclass, m.f8vrclass, constraints, CurrTypeConstraints>,
RISCVMaskedPseudo<MaskIdx=2>,
- SchedUnary<"WriteVExtV", "ReadVExtV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVExtV", "ReadVExtV", mx, forcePassthruRead=true>;
}
}
}
@@ -2657,16 +2657,16 @@ multiclass VPseudoVGTR_VV_VX_VI {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VX<m, constraint>,
SchedBinary<"WriteVRGatherVX", "ReadVRGatherVX_data",
- "ReadVRGatherVX_index", mx, forceMergeOpRead=true>;
+ "ReadVRGatherVX_index", mx, forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VI<uimm5, m, constraint>,
SchedUnary<"WriteVRGatherVI", "ReadVRGatherVI_data", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defvar sews = SchedSEWSet<mx>.val;
foreach e = sews in {
defm "" : VPseudoBinaryV_VV<m, constraint, e>,
SchedBinary<"WriteVRGatherVV", "ReadVRGatherVV_data",
- "ReadVRGatherVV_index", mx, e, forceMergeOpRead=true>;
+ "ReadVRGatherVV_index", mx, e, forcePassthruRead=true>;
}
}
}
@@ -2676,12 +2676,12 @@ multiclass VPseudoVSALU_VV_VX_VI<bit Commutable = 0> {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV<m, Commutable=Commutable>,
SchedBinary<"WriteVSALUV", "ReadVSALUV", "ReadVSALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVSALUX", "ReadVSALUV", "ReadVSALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VI<simm5, m>,
- SchedUnary<"WriteVSALUI", "ReadVSALUV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVSALUI", "ReadVSALUV", mx, forcePassthruRead=true>;
}
}
@@ -2691,12 +2691,12 @@ multiclass VPseudoVSHT_VV_VX_VI {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVShiftV", "ReadVShiftV", "ReadVShiftV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVShiftX", "ReadVShiftV", "ReadVShiftX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VI<uimm5, m>,
- SchedUnary<"WriteVShiftI", "ReadVShiftV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVShiftI", "ReadVShiftV", mx, forcePassthruRead=true>;
}
}
@@ -2705,12 +2705,12 @@ multiclass VPseudoVSSHT_VV_VX_VI_RM {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV_RM<m>,
SchedBinary<"WriteVSShiftV", "ReadVSShiftV", "ReadVSShiftV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX_RM<m>,
SchedBinary<"WriteVSShiftX", "ReadVSShiftV", "ReadVSShiftX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VI_RM<uimm5, m>,
- SchedUnary<"WriteVSShiftI", "ReadVSShiftV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVSShiftI", "ReadVSShiftV", mx, forcePassthruRead=true>;
}
}
@@ -2719,12 +2719,12 @@ multiclass VPseudoVALU_VV_VX_VI<bit Commutable = 0> {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV<m, Commutable=Commutable>,
SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VI<simm5, m>,
- SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forcePassthruRead=true>;
}
}
@@ -2733,10 +2733,10 @@ multiclass VPseudoVSALU_VV_VX {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVSALUV", "ReadVSALUV", "ReadVSALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVSALUX", "ReadVSALUV", "ReadVSALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2745,10 +2745,10 @@ multiclass VPseudoVSMUL_VV_VX_RM {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV_RM<m, Commutable=1>,
SchedBinary<"WriteVSMulV", "ReadVSMulV", "ReadVSMulV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX_RM<m>,
SchedBinary<"WriteVSMulX", "ReadVSMulV", "ReadVSMulX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2757,10 +2757,10 @@ multiclass VPseudoVAALU_VV_VX_RM<bit Commutable = 0> {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV_RM<m, Commutable=Commutable>,
SchedBinary<"WriteVAALUV", "ReadVAALUV", "ReadVAALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX_RM<m>,
SchedBinary<"WriteVAALUX", "ReadVAALUV", "ReadVAALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2802,14 +2802,14 @@ multiclass VPseudoVFMUL_VV_VF_RM {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
defm "" : VPseudoBinaryFV_VV_RM<m, e>,
SchedBinary<"WriteVFMulV", "ReadVFMulV", "ReadVFMulV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
foreach f = FPList in {
foreach m = f.MxList in {
defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFMulF", "ReadVFMulV", "ReadVFMulF", m.MX,
- f.SEW, forceMergeOpRead=true>;
+ f.SEW, forcePassthruRead=true>;
}
}
}
@@ -2821,7 +2821,7 @@ multiclass VPseudoVFDIV_VV_VF_RM {
foreach e = sews in {
defm "" : VPseudoBinaryFV_VV_RM<m, e>,
SchedBinary<"WriteVFDivV", "ReadVFDivV", "ReadVFDivV", mx, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2829,7 +2829,7 @@ multiclass VPseudoVFDIV_VV_VF_RM {
foreach m = f.MxList in {
defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFDivF", "ReadVFDivV", "ReadVFDivF", m.MX, f.SEW,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2839,7 +2839,7 @@ multiclass VPseudoVFRDIV_VF_RM {
foreach m = f.MxList in {
defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFDivF", "ReadVFDivV", "ReadVFDivF", m.MX, f.SEW,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
}
@@ -2848,10 +2848,10 @@ multiclass VPseudoVALU_VV_VX {
foreach m = MxList in {
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2860,14 +2860,14 @@ multiclass VPseudoVSGNJ_VV_VF {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
defm "" : VPseudoBinaryV_VV<m, sew=e>,
SchedBinary<"WriteVFSgnjV", "ReadVFSgnjV", "ReadVFSgnjV", m.MX,
- e, forceMergeOpRead=true>;
+ e, forcePassthruRead=true>;
}
foreach f = FPList in {
foreach m = f.MxList in {
defm "" : VPseudoBinaryV_VF<m, f, sew=f.SEW>,
SchedBinary<"WriteVFSgnjF", "ReadVFSgnjV", "ReadVFSgnjF", m.MX,
- f.SEW, forceMergeOpRead=true>;
+ f.SEW, forcePassthruRead=true>;
}
}
}
@@ -2877,14 +2877,14 @@ multiclass VPseudoVMAX_VV_VF {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
defm "" : VPseudoBinaryV_VV<m, sew=e>,
SchedBinary<"WriteVFMinMaxV", "ReadVFMinMaxV", "ReadVFMinMaxV",
- m.MX, e, forceMergeOpRead=true>;
+ m.MX, e, forcePassthruRead=true>;
}
foreach f = FPList in {
foreach m = f.MxList in {
defm "" : VPseudoBinaryV_VF<m, f, sew=f.SEW>,
SchedBinary<"WriteVFMinMaxF", "ReadVFMinMaxV", "ReadVFMinMaxF",
- m.MX, f.SEW, forceMergeOpRead=true>;
+ m.MX, f.SEW, forcePassthruRead=true>;
}
}
}
@@ -2894,14 +2894,14 @@ multiclass VPseudoVALU_VV_VF_RM {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
defm "" : VPseudoBinaryFV_VV_RM<m, e>,
SchedBinary<"WriteVFALUV", "ReadVFALUV", "ReadVFALUV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
foreach f = FPList in {
foreach m = f.MxList in {
defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
- f.SEW, forceMergeOpRead=true>;
+ f.SEW, forcePassthruRead=true>;
}
}
}
@@ -2911,7 +2911,7 @@ multiclass VPseudoVALU_VF_RM {
foreach m = f.MxList in {
defm "" : VPseudoBinaryV_VF_RM<m, f, f.SEW>,
SchedBinary<"WriteVFALUF", "ReadVFALUV", "ReadVFALUF", m.MX,
- f.SEW, forceMergeOpRead=true>;
+ f.SEW, forcePassthruRead=true>;
}
}
}
@@ -2921,9 +2921,9 @@ multiclass VPseudoVALU_VX_VI {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VI<simm5, m>,
- SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVIALUI", "ReadVIALUV", mx, forcePassthruRead=true>;
}
}
@@ -2932,10 +2932,10 @@ multiclass VPseudoVWALU_VV_VX<bit Commutable = 0> {
defvar mx = m.MX;
defm "" : VPseudoBinaryW_VV<m, Commutable=Commutable>,
SchedBinary<"WriteVIWALUV", "ReadVIWALUV", "ReadVIWALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryW_VX<m>,
SchedBinary<"WriteVIWALUX", "ReadVIWALUV", "ReadVIWALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2944,10 +2944,10 @@ multiclass VPseudoVWMUL_VV_VX<bit Commutable = 0> {
defvar mx = m.MX;
defm "" : VPseudoBinaryW_VV<m, Commutable=Commutable>,
SchedBinary<"WriteVIWMulV", "ReadVIWMulV", "ReadVIWMulV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryW_VX<m>,
SchedBinary<"WriteVIWMulX", "ReadVIWMulV", "ReadVIWMulX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2956,14 +2956,14 @@ multiclass VPseudoVWMUL_VV_VF_RM {
foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
defm "" : VPseudoBinaryW_VV_RM<m, sew=e>,
SchedBinary<"WriteVFWMulV", "ReadVFWMulV", "ReadVFWMulV", m.MX,
- e, forceMergeOpRead=true>;
+ e, forcePassthruRead=true>;
}
foreach f = FPListW in {
foreach m = f.MxListFW in {
defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>,
SchedBinary<"WriteVFWMulF", "ReadVFWMulV", "ReadVFWMulF", m.MX,
- f.SEW, forceMergeOpRead=true>;
+ f.SEW, forcePassthruRead=true>;
}
}
}
@@ -2973,10 +2973,10 @@ multiclass VPseudoVWALU_WV_WX {
defvar mx = m.MX;
defm "" : VPseudoBinaryW_WV<m>,
SchedBinary<"WriteVIWALUV", "ReadVIWALUV", "ReadVIWALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryW_WX<m>,
SchedBinary<"WriteVIWALUX", "ReadVIWALUV", "ReadVIWALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -2985,14 +2985,14 @@ multiclass VPseudoVFWALU_VV_VF_RM {
foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
defm "" : VPseudoBinaryW_VV_RM<m, sew=e>,
SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX,
- e, forceMergeOpRead=true>;
+ e, forcePassthruRead=true>;
}
foreach f = FPListW in {
foreach m = f.MxListFW in {
defm "" : VPseudoBinaryW_VF_RM<m, f, sew=f.SEW>,
SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX,
- f.SEW, forceMergeOpRead=true>;
+ f.SEW, forcePassthruRead=true>;
}
}
}
@@ -3002,13 +3002,13 @@ multiclass VPseudoVFWALU_WV_WF_RM {
foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
defm "" : VPseudoBinaryW_WV_RM<m, sew=e>,
SchedBinary<"WriteVFWALUV", "ReadVFWALUV", "ReadVFWALUV", m.MX,
- e, forceMergeOpRead=true>;
+ e, forcePassthruRead=true>;
}
foreach f = FPListW in {
foreach m = f.MxListFW in {
defm "" : VPseudoBinaryW_WF_RM<m, f, sew=f.SEW>,
SchedBinary<"WriteVFWALUF", "ReadVFWALUV", "ReadVFWALUF", m.MX,
- f.SEW, forceMergeOpRead=true>;
+ f.SEW, forcePassthruRead=true>;
}
}
}
@@ -3020,17 +3020,17 @@ multiclass VPseudoVMRG_VM_XM_IM {
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
m.vrclass, m.vrclass, m>,
SchedBinary<"WriteVIMergeV", "ReadVIMergeV", "ReadVIMergeV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
def "_VXM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
m.vrclass, GPR, m>,
SchedBinary<"WriteVIMergeX", "ReadVIMergeV", "ReadVIMergeX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
def "_VIM" # "_" # m.MX:
VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
m.vrclass, simm5, m>,
SchedUnary<"WriteVIMergeI", "ReadVIMergeV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3039,13 +3039,13 @@ multiclass VPseudoVCALU_VM_XM_IM {
defvar mx = m.MX;
defm "" : VPseudoTiedBinaryV_VM<m, Commutable=1>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoTiedBinaryV_XM<m>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoTiedBinaryV_IM<m>,
SchedUnary<"WriteVICALUI", "ReadVICALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3054,10 +3054,10 @@ multiclass VPseudoVCALU_VM_XM {
defvar mx = m.MX;
defm "" : VPseudoTiedBinaryV_VM<m>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoTiedBinaryV_XM<m>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3068,13 +3068,13 @@ multiclass VPseudoVCALUM_VM_XM_IM {
defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
Commutable=1, TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMasked=1,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx, forceMasked=1,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=1, Constraint=constraint, TargetConstraintType=2>,
SchedUnary<"WriteVICALUI", "ReadVICALUV", mx, forceMasked=1,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3085,11 +3085,11 @@ multiclass VPseudoVCALUM_VM_XM {
defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMasked=1,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=1, Constraint=constraint,
TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx, forceMasked=1,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3100,13 +3100,13 @@ multiclass VPseudoVCALUM_V_X_I {
defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=constraint,
Commutable=1, TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_IM<m, CarryOut=1, CarryIn=0, Constraint=constraint>,
SchedUnary<"WriteVICALUI", "ReadVICALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3116,10 +3116,10 @@ multiclass VPseudoVCALUM_V_X {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_XM<m, CarryOut=1, CarryIn=0, Constraint=constraint, TargetConstraintType=2>,
SchedBinary<"WriteVICALUX", "ReadVICALUV", "ReadVICALUX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3128,13 +3128,13 @@ multiclass VPseudoVNCLP_WV_WX_WI_RM {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_WV_RM<m>,
SchedBinary<"WriteVNClipV", "ReadVNClipV", "ReadVNClipV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_WX_RM<m>,
SchedBinary<"WriteVNClipX", "ReadVNClipV", "ReadVNClipX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_WI_RM<m>,
SchedUnary<"WriteVNClipI", "ReadVNClipV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3143,13 +3143,13 @@ multiclass VPseudoVNSHT_WV_WX_WI {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_WV<m>,
SchedBinary<"WriteVNShiftV", "ReadVNShiftV", "ReadVNShiftV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_WX<m>,
SchedBinary<"WriteVNShiftX", "ReadVNShiftV", "ReadVNShiftX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_WI<m>,
SchedUnary<"WriteVNShiftI", "ReadVNShiftV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3591,7 +3591,7 @@ multiclass VPseudoVCVTI_V {
foreach m = MxListF in {
defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>,
SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3599,7 +3599,7 @@ multiclass VPseudoVCVTI_V_RM {
foreach m = MxListF in {
defm _V : VPseudoConversionRoundingMode<m.vrclass, m.vrclass, m>,
SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3607,7 +3607,7 @@ multiclass VPseudoVCVTI_RM_V {
foreach m = MxListF in {
defm _V : VPseudoConversionRM<m.vrclass, m.vrclass, m>,
SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3615,7 +3615,7 @@ multiclass VPseudoVFROUND_NOEXCEPT_V {
foreach m = MxListF in {
defm _V : VPseudoConversionNoExcept<m.vrclass, m.vrclass, m>,
SchedUnary<"WriteVFCvtFToIV", "ReadVFCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3624,7 +3624,7 @@ multiclass VPseudoVCVTF_V_RM {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
defm _V : VPseudoConversionRoundingMode<m.vrclass, m.vrclass, m, sew=e>,
SchedUnary<"WriteVFCvtIToFV", "ReadVFCvtIToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3633,7 +3633,7 @@ multiclass VPseudoVCVTF_RM_V {
foreach e = SchedSEWSet<m.MX, isF=1>.val in
defm _V : VPseudoConversionRM<m.vrclass, m.vrclass, m, sew=e>,
SchedUnary<"WriteVFCvtIToFV", "ReadVFCvtIToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3642,7 +3642,7 @@ multiclass VPseudoVWCVTI_V {
foreach m = MxListFW in {
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, TargetConstraintType=3>,
SchedUnary<"WriteVFWCvtFToIV", "ReadVFWCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3651,7 +3651,7 @@ multiclass VPseudoVWCVTI_V_RM {
foreach m = MxListFW in {
defm _V : VPseudoConversionRoundingMode<m.wvrclass, m.vrclass, m, constraint, TargetConstraintType=3>,
SchedUnary<"WriteVFWCvtFToIV", "ReadVFWCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3660,7 +3660,7 @@ multiclass VPseudoVWCVTI_RM_V {
foreach m = MxListFW in {
defm _V : VPseudoConversionRM<m.wvrclass, m.vrclass, m, constraint>,
SchedUnary<"WriteVFWCvtFToIV", "ReadVFWCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3671,7 +3671,7 @@ multiclass VPseudoVWCVTF_V {
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=e,
TargetConstraintType=3>,
SchedUnary<"WriteVFWCvtIToFV", "ReadVFWCvtIToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3682,7 +3682,7 @@ multiclass VPseudoVWCVTD_V {
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint, sew=e,
TargetConstraintType=3>,
SchedUnary<"WriteVFWCvtFToFV", "ReadVFWCvtFToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3691,7 +3691,7 @@ multiclass VPseudoVNCVTI_W {
foreach m = MxListW in {
defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint, TargetConstraintType=2>,
SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3700,7 +3700,7 @@ multiclass VPseudoVNCVTI_W_RM {
foreach m = MxListW in {
defm _W : VPseudoConversionRoundingMode<m.vrclass, m.wvrclass, m, constraint, TargetConstraintType=2>,
SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3709,7 +3709,7 @@ multiclass VPseudoVNCVTI_RM_W {
foreach m = MxListW in {
defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint, TargetConstraintType=2>,
SchedUnary<"WriteVFNCvtFToIV", "ReadVFNCvtFToIV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3721,7 +3721,7 @@ multiclass VPseudoVNCVTF_W_RM {
constraint, sew=e,
TargetConstraintType=2>,
SchedUnary<"WriteVFNCvtIToFV", "ReadVFNCvtIToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3731,7 +3731,7 @@ multiclass VPseudoVNCVTF_RM_W {
foreach e = SchedSEWSet<m.MX, isF=1, isWidening=1>.val in
defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint, sew=e>,
SchedUnary<"WriteVFNCvtIToFV", "ReadVFNCvtIToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3742,7 +3742,7 @@ multiclass VPseudoVNCVTD_W {
defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint, sew=e,
TargetConstraintType=2>,
SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3754,7 +3754,7 @@ multiclass VPseudoVNCVTD_W_RM {
constraint, sew=e,
TargetConstraintType=2>,
SchedUnary<"WriteVFNCvtFToFV", "ReadVFNCvtFToFV", m.MX, e,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -3922,14 +3922,14 @@ class VPatUnaryNoMask<string intrinsic_name,
VReg op2_reg_class,
bit isSEWAware = 0> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew),
inst#"_"#kind#"_"#vlmul.MX))
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
GPR:$vl, log2sew, TU_MU)>;
@@ -3944,7 +3944,7 @@ class VPatUnaryNoMaskRoundingMode<string intrinsic_name,
VReg op2_reg_class,
bit isSEWAware = 0> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(XLenVT timm:$round),
VLOpFrag)),
@@ -3952,7 +3952,7 @@ class VPatUnaryNoMaskRoundingMode<string intrinsic_name,
!if(isSEWAware,
inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew),
inst#"_"#kind#"_"#vlmul.MX))
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(XLenVT timm:$round),
GPR:$vl, log2sew, TU_MU)>;
@@ -3968,7 +3968,7 @@ class VPatUnaryNoMaskRTZ<string intrinsic_name,
VReg op2_reg_class,
bit isSEWAware = 0> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(XLenVT 0b001),
VLOpFrag)),
@@ -3976,7 +3976,7 @@ class VPatUnaryNoMaskRTZ<string intrinsic_name,
!if(isSEWAware,
inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew),
inst#"_"#kind#"_"#vlmul.MX))
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
GPR:$vl, log2sew, TU_MU)>;
@@ -3992,7 +3992,7 @@ class VPatUnaryMask<string intrinsic_name,
VReg op2_reg_class,
bit isSEWAware = 0> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(mask_type V0),
VLOpFrag, (XLenVT timm:$policy))),
@@ -4000,7 +4000,7 @@ class VPatUnaryMask<string intrinsic_name,
!if(isSEWAware,
inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(mask_type V0), GPR:$vl, log2sew, (XLenVT timm:$policy))>;
@@ -4016,7 +4016,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
VReg op2_reg_class,
bit isSEWAware = 0> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(mask_type V0),
(XLenVT timm:$round),
@@ -4025,7 +4025,7 @@ class VPatUnaryMaskRoundingMode<string intrinsic_name,
!if(isSEWAware,
inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(mask_type V0),
(XLenVT timm:$round),
@@ -4043,7 +4043,7 @@ class VPatUnaryMaskRTZ<string intrinsic_name,
VReg op2_reg_class,
bit isSEWAware = 0> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(mask_type V0),
(XLenVT 0b001),
@@ -4052,7 +4052,7 @@ class VPatUnaryMaskRTZ<string intrinsic_name,
!if(isSEWAware,
inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
inst#"_"#kind#"_"#vlmul.MX#"_MASK"))
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_reg_class:$rs2),
(mask_type V0),
GPR:$vl, log2sew, (XLenVT timm:$policy))>;
@@ -4071,12 +4071,12 @@ class VPatMaskUnaryMask<string intrinsic_name,
string inst,
MTypeInfo mti> :
Pat<(mti.Mask (!cast<Intrinsic>(intrinsic_name#"_mask")
- (mti.Mask VR:$merge),
+ (mti.Mask VR:$passthru),
(mti.Mask VR:$rs2),
(mti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(inst#"_M_"#mti.BX#"_MASK")
- (mti.Mask VR:$merge),
+ (mti.Mask VR:$passthru),
(mti.Mask VR:$rs2),
(mti.Mask V0), GPR:$vl, mti.Log2SEW, TU_MU)>;
@@ -4091,12 +4091,12 @@ class VPatUnaryAnyMask<string intrinsic,
VReg result_reg_class,
VReg op1_reg_class> :
Pat<(result_type (!cast<Intrinsic>(intrinsic)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(mask_type VR:$rs2),
VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX#"_E"#!shl(1, log2sew))
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(mask_type VR:$rs2),
GPR:$vl, log2sew)>;
@@ -4128,12 +4128,12 @@ class VPatBinaryNoMaskTU<string intrinsic_name,
VReg op1_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
VLOpFrag)),
(!cast<Instruction>(inst)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
GPR:$vl, sew, TU_MU)>;
@@ -4148,13 +4148,13 @@ class VPatBinaryNoMaskTURoundingMode<string intrinsic_name,
VReg op1_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(XLenVT timm:$round),
VLOpFrag)),
(!cast<Instruction>(inst)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(XLenVT timm:$round),
@@ -4190,13 +4190,13 @@ class VPatBinaryMask<string intrinsic_name,
VReg op1_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew)>;
@@ -4212,13 +4212,13 @@ class VPatBinaryMaskPolicy<string intrinsic_name,
VReg op1_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
@@ -4234,14 +4234,14 @@ class VPatBinaryMaskPolicyRoundingMode<string intrinsic_name,
VReg op1_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
(XLenVT timm:$round),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
@@ -4260,13 +4260,13 @@ class VPatBinaryMaskSwapped<string intrinsic_name,
VReg op1_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(op1_type op1_reg_class:$rs1),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew)>;
@@ -4315,12 +4315,12 @@ class VPatTiedBinaryNoMaskTU<string intrinsic_name,
VReg result_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type result_reg_class:$merge),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
VLOpFrag)),
(!cast<Instruction>(inst#"_TIED")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
GPR:$vl, sew, TU_MU)>;
@@ -4332,13 +4332,13 @@ class VPatTiedBinaryNoMaskTURoundingMode<string intrinsic_name,
VReg result_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name)
- (result_type result_reg_class:$merge),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(XLenVT timm:$round),
VLOpFrag)),
(!cast<Instruction>(inst#"_TIED")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(XLenVT timm:$round),
GPR:$vl, sew, TU_MU)>;
@@ -4352,13 +4352,13 @@ class VPatTiedBinaryMask<string intrinsic_name,
VReg result_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(mask_type V0),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK_TIED")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew, (XLenVT timm:$policy))>;
@@ -4371,14 +4371,14 @@ class VPatTiedBinaryMaskRoundingMode<string intrinsic_name,
VReg result_reg_class,
DAGOperand op2_kind> :
Pat<(result_type (!cast<Intrinsic>(intrinsic_name#"_mask")
- (result_type result_reg_class:$merge),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(mask_type V0),
(XLenVT timm:$round),
VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(inst#"_MASK_TIED")
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op2_type op2_kind:$rs2),
(mask_type V0),
(XLenVT timm:$round),
@@ -4678,15 +4678,15 @@ multiclass VPatNullaryV<string intrinsic, string instruction> {
foreach vti = AllIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic)
- (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$passthru),
VLOpFrag)),
(!cast<Instruction>(instruction#"_V_" # vti.LMul.MX)
- vti.RegClass:$merge, GPR:$vl, vti.Log2SEW, TU_MU)>;
+ vti.RegClass:$passthru, GPR:$vl, vti.Log2SEW, TU_MU)>;
def : Pat<(vti.Vector (!cast<Intrinsic>(intrinsic # "_mask")
- (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag, (XLenVT timm:$policy))),
(!cast<Instruction>(instruction#"_V_" # vti.LMul.MX # "_MASK")
- vti.RegClass:$merge, (vti.Mask V0),
+ vti.RegClass:$passthru, (vti.Mask V0),
GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
}
}
@@ -4781,13 +4781,13 @@ multiclass VPatBinaryCarryInTAIL<string intrinsic,
VReg op1_reg_class,
DAGOperand op2_kind> {
def : Pat<(result_type (!cast<Intrinsic>(intrinsic)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(inst#"_"#kind#"_"#vlmul.MX)
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(op1_type op1_reg_class:$rs1),
(op2_type op2_kind:$rs2),
(mask_type V0), GPR:$vl, sew)>;
@@ -6065,12 +6065,12 @@ multiclass VPatCompare_VI<string intrinsic, string inst,
defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
defvar PseudoMask = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX#"_MASK");
let Predicates = GetVTypePredicates<vti>.Predicates in
- def : Pat<(vti.Mask (IntrMask (vti.Mask VR:$merge),
+ def : Pat<(vti.Mask (IntrMask (vti.Mask VR:$passthru),
(vti.Vector vti.RegClass:$rs1),
(vti.Scalar ImmType:$rs2),
(vti.Mask V0),
VLOpFrag)),
- (PseudoMask VR:$merge, vti.RegClass:$rs1, (DecImm ImmType:$rs2),
+ (PseudoMask VR:$passthru, vti.RegClass:$rs1, (DecImm ImmType:$rs2),
(vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
}
@@ -6215,24 +6215,24 @@ foreach vti = AllIntegerVectors in {
// to use a more complex splat sequence. Add the pattern for all VTs for
// consistency.
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (int_riscv_vrsub (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs2),
(vti.Vector vti.RegClass:$rs1),
VLOpFrag)),
(!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX)
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs1,
vti.RegClass:$rs2,
GPR:$vl,
vti.Log2SEW, TU_MU)>;
- def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (int_riscv_vrsub_mask (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs2),
(vti.Vector vti.RegClass:$rs1),
(vti.Mask V0),
VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>("PseudoVSUB_VV_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs1,
vti.RegClass:$rs2,
(vti.Mask V0),
@@ -6241,24 +6241,24 @@ foreach vti = AllIntegerVectors in {
(XLenVT timm:$policy))>;
// Match VSUB with a small immediate to vadd.vi by negating the immediate.
- def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (int_riscv_vsub (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs1),
(vti.Scalar simm5_plus1:$rs2),
VLOpFrag)),
(!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX)
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs1,
(NegImm simm5_plus1:$rs2),
GPR:$vl,
vti.Log2SEW, TU_MU)>;
- def : Pat<(vti.Vector (int_riscv_vsub_mask (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (int_riscv_vsub_mask (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs1),
(vti.Scalar simm5_plus1:$rs2),
(vti.Mask V0),
VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>("PseudoVADD_VI_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs1,
(NegImm simm5_plus1:$rs2),
(vti.Mask V0),
@@ -6781,26 +6781,20 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
let Predicates = [HasVInstructionsAnyF] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
- foreach m = f.MxList in {
- defvar mx = m.MX;
- let VLMul = m.value in {
- let HasSEWOp = 1, BaseInstr = VFMV_F_S in
- def "PseudoVFMV_" # f.FX # "_S_" # mx :
- Pseudo<(outs f.fprclass:$rd),
- (ins m.vrclass:$rs2, ixlenimm:$sew), []>,
- Sched<[WriteVMovFS, ReadVMovFS]>,
- RISCVVPseudo;
- let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F,
- Constraints = "$rd = $rs1" in
- def "PseudoVFMV_S_" # f.FX # "_" # mx :
- Pseudo<(outs m.vrclass:$rd),
- (ins m.vrclass:$rs1, f.fprclass:$rs2,
- AVL:$vl, ixlenimm:$sew),
- []>,
- Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>,
- RISCVVPseudo;
- }
- }
+ let HasSEWOp = 1, BaseInstr = VFMV_F_S in
+ def "PseudoVFMV_" # f.FX # "_S" :
+ Pseudo<(outs f.fprclass:$rd),
+ (ins VR:$rs2, ixlenimm:$sew), []>,
+ Sched<[WriteVMovFS, ReadVMovFS]>,
+ RISCVVPseudo;
+ let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F,
+ Constraints = "$rd = $rs1" in
+ def "PseudoVFMV_S_" # f.FX :
+ Pseudo<(outs VR:$rd),
+ (ins VR:$rs1, f.fprclass:$rs2, AVL:$vl, ixlenimm:$sew),
+ []>,
+ Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>,
+ RISCVVPseudo;
}
}
} // Predicates = [HasVInstructionsAnyF]
@@ -6907,20 +6901,20 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
foreach vti = AllIntegerVectors in {
// Emit shift by 1 as an add since it might be faster.
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs1),
(XLenVT 1), VLOpFrag)),
(!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX)
- vti.RegClass:$merge, vti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs1,
vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TU_MU)>;
- def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs1),
(XLenVT 1),
(vti.Mask V0),
VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs1,
vti.RegClass:$rs1,
(vti.Mask V0),
@@ -7241,11 +7235,6 @@ foreach vti = AllFloatVectors in {
vti.Vector, vti.Vector, vti.Mask,
vti.Log2SEW, vti.LMul, vti.RegClass,
vti.RegClass, vti.RegClass>;
- defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVMERGE", "VVM",
- vti.Vector,
- vti.Vector, vti.Vector, vti.Mask,
- vti.Log2SEW, vti.LMul, vti.RegClass,
- vti.RegClass, vti.RegClass>;
defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE",
"V"#vti.ScalarSuffix#"M",
vti.Vector,
@@ -7258,11 +7247,11 @@ foreach vti = AllFloatVectors in {
foreach fvti = AllFloatVectors in {
defvar instr = !cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX);
let Predicates = GetVTypePredicates<fvti>.Predicates in
- def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$merge),
+ def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru),
(fvti.Vector fvti.RegClass:$rs2),
(fvti.Scalar (fpimm0)),
(fvti.Mask V0), VLOpFrag)),
- (instr fvti.RegClass:$merge, fvti.RegClass:$rs2, 0,
+ (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0,
(fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
}
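The operand renamed from $merge to $passthru throughout the patterns above is the value whose elements are carried into the tail (and, under a mask-undisturbed policy, the masked-off) positions of the result. As a rough illustration only, not part of this patch, the same operand is visible at the C level through the tail-undisturbed RVV intrinsics; the helper name below and the choice of __riscv_vadd_vv_i32m1_tu are assumptions for the sketch.

#include <riscv_vector.h>

// Sketch: elements [0, vl) receive a + b; elements [vl, VLMAX) are copied
// from passthru, which is the behavior the tail-undisturbed (TU) policy in
// the unmasked patterns above selects.
vint32m1_t add_keep_tail(vint32m1_t passthru, vint32m1_t a, vint32m1_t b,
                         size_t vl) {
  return __riscv_vadd_vv_i32m1_tu(passthru, a, b, vl);
}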
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index cd4c3b6..8d64788 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -1430,7 +1430,7 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- let Predicates = [HasVInstructionsBF16] in
+ let Predicates = [HasVInstructionsBF16Minimal] in
def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
(!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
(fvti.Vector (IMPLICIT_DEF)),
@@ -1444,14 +1444,14 @@ foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
//===----------------------------------------------------------------------===//
// Vector Element Extracts
//===----------------------------------------------------------------------===//
-foreach vti = AllFloatVectors in {
- defvar vmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
+foreach vti = NoGroupFloatVectors in {
+ defvar vfmv_f_s_inst = !cast<Instruction>(!strconcat("PseudoVFMV_",
vti.ScalarSuffix,
- "_S_", vti.LMul.MX));
+ "_S"));
// Only pattern-match extract-element operations where the index is 0. Any
// other index will have been custom-lowered to slide the vector correctly
// into place.
let Predicates = GetVTypePredicates<vti>.Predicates in
def : Pat<(vti.Scalar (extractelt (vti.Vector vti.RegClass:$rs2), 0)),
- (vmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
+ (vfmv_f_s_inst vti.RegClass:$rs2, vti.Log2SEW)>;
}
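The RISCVInstrInfoVSDPatterns.td hunk above now matches extract-element at index 0 against the single, register-group-independent PseudoVFMV_*_S pseudo. As a hedged illustration only, not part of this patch, the corresponding operation at the C intrinsic level reads element 0 regardless of vl; the function name below is invented and the intrinsic is assumed to be the standard RVV one.

#include <riscv_vector.h>

// Sketch: vfmv.f.s moves element 0 of the vector into a scalar FP register;
// any other index is handled by sliding the vector first, as the comment in
// the hunk above notes.
float first_element(vfloat32m1_t v) {
  return __riscv_vfmv_f_s_f32m1_f32(v);
}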
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 2ed71f6..394da80 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -36,7 +36,7 @@ def SDT_RISCVIntBinOp_VL : SDTypeProfile<1, 5, [SDTCisSameAs<0, 1>,
SDTCisSameNumEltsAs<0, 4>,
SDTCisVT<5, XLenVT>]>;
-// Input: (vector, vector/scalar, merge, mask, roundmode, vl)
+// Input: (vector, vector/scalar, passthru, mask, roundmode, vl)
def SDT_RISCVVNBinOp_RM_VL : SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisInt<0>,
SDTCisSameAs<0, 3>,
SDTCisSameNumEltsAs<0, 1>,
@@ -149,18 +149,18 @@ def riscv_strict_fmul_vl : SDNode<"RISCVISD::STRICT_FMUL_VL", SDT_RISCVFPBinOp
def riscv_strict_fdiv_vl : SDNode<"RISCVISD::STRICT_FDIV_VL", SDT_RISCVFPBinOp_VL, [SDNPHasChain]>;
def riscv_strict_fsqrt_vl : SDNode<"RISCVISD::STRICT_FSQRT_VL", SDT_RISCVFPUnOp_VL, [SDNPHasChain]>;
-def any_riscv_fadd_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- [(riscv_fadd_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- (riscv_strict_fadd_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fsub_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- [(riscv_fsub_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- (riscv_strict_fsub_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fmul_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- [(riscv_fmul_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- (riscv_strict_fmul_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fdiv_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- [(riscv_fdiv_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl),
- (riscv_strict_fdiv_vl node:$lhs, node:$rhs, node:$merge, node:$mask, node:$vl)]>;
+def any_riscv_fadd_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ [(riscv_fadd_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ (riscv_strict_fadd_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fsub_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ [(riscv_fsub_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ (riscv_strict_fsub_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fmul_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ [(riscv_fmul_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ (riscv_strict_fmul_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fdiv_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ [(riscv_fdiv_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl),
+ (riscv_strict_fdiv_vl node:$lhs, node:$rhs, node:$passthru, node:$mask, node:$vl)]>;
def any_riscv_fsqrt_vl : PatFrags<(ops node:$src, node:$mask, node:$vl),
[(riscv_fsqrt_vl node:$src, node:$mask, node:$vl),
(riscv_strict_fsqrt_vl node:$src, node:$mask, node:$vl)]>;
@@ -318,12 +318,12 @@ def any_riscv_vfround_noexcept_vl : PatFrags<(ops node:$src, node:$mask, node:$v
def riscv_setcc_vl : SDNode<"RISCVISD::SETCC_VL", SDT_RISCVSETCCOP_VL>;
def riscv_strict_fsetcc_vl : SDNode<"RISCVISD::STRICT_FSETCC_VL", SDT_RISCVSETCCOP_VL, [SDNPHasChain]>;
def riscv_strict_fsetccs_vl : SDNode<"RISCVISD::STRICT_FSETCCS_VL", SDT_RISCVSETCCOP_VL, [SDNPHasChain]>;
-def any_riscv_fsetcc_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
- [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
- (riscv_strict_fsetcc_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl)]>;
-def any_riscv_fsetccs_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
- [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl),
- (riscv_strict_fsetccs_vl node:$lhs, node:$rhs, node:$cc, node:$merge, node:$mask, node:$vl)]>;
+def any_riscv_fsetcc_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+ [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+ (riscv_strict_fsetcc_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl)]>;
+def any_riscv_fsetccs_vl : PatFrags<(ops node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+ [(riscv_setcc_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl),
+ (riscv_strict_fsetccs_vl node:$lhs, node:$rhs, node:$cc, node:$passthru, node:$mask, node:$vl)]>;
def riscv_vrgather_vx_vl : SDNode<"RISCVISD::VRGATHER_VX_VL",
SDTypeProfile<1, 5, [SDTCisVec<0>,
@@ -640,14 +640,14 @@ class VPatBinaryVL_V<SDPatternOperator vop,
: Pat<(result_type (vop
(op1_type op1_reg_class:$rs1),
(op2_type op2_reg_class:$rs2),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
instruction_name#"_"#suffix#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
instruction_name#"_"#suffix#"_"#vlmul.MX#"_MASK"))
- result_reg_class:$merge,
+ result_reg_class:$passthru,
op1_reg_class:$rs1,
op2_reg_class:$rs2,
(mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
@@ -668,14 +668,14 @@ class VPatBinaryVL_V_RM<SDPatternOperator vop,
: Pat<(result_type (vop
(op1_type op1_reg_class:$rs1),
(op2_type op2_reg_class:$rs2),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
instruction_name#"_"#suffix#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
instruction_name#"_"#suffix#"_"#vlmul.MX#"_MASK"))
- result_reg_class:$merge,
+ result_reg_class:$passthru,
op1_reg_class:$rs1,
op2_reg_class:$rs2,
(mask_type V0),
@@ -800,14 +800,14 @@ class VPatBinaryVL_XI<SDPatternOperator vop,
: Pat<(result_type (vop
(vop1_type vop_reg_class:$rs1),
(vop2_type (SplatPatKind (XLenVT xop_kind:$rs2))),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
instruction_name#_#suffix#_#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
instruction_name#_#suffix#_#vlmul.MX#"_MASK"))
- result_reg_class:$merge,
+ result_reg_class:$passthru,
vop_reg_class:$rs1,
xop_kind:$rs2,
(mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
@@ -924,14 +924,14 @@ class VPatBinaryVL_VF<SDPatternOperator vop,
bit isSEWAware = 0>
: Pat<(result_type (vop (vop1_type vop_reg_class:$rs1),
(vop2_type (SplatFPOp scalar_reg_class:$rs2)),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
instruction_name#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
instruction_name#"_"#vlmul.MX#"_MASK"))
- result_reg_class:$merge,
+ result_reg_class:$passthru,
vop_reg_class:$rs1,
scalar_reg_class:$rs2,
(mask_type V0), GPR:$vl, log2sew, TAIL_AGNOSTIC)>;
@@ -950,14 +950,14 @@ class VPatBinaryVL_VF_RM<SDPatternOperator vop,
bit isSEWAware = 0>
: Pat<(result_type (vop (vop1_type vop_reg_class:$rs1),
(vop2_type (SplatFPOp scalar_reg_class:$rs2)),
- (result_type result_reg_class:$merge),
+ (result_type result_reg_class:$passthru),
(mask_type V0),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
instruction_name#"_"#vlmul.MX#"_E"#!shl(1, log2sew)#"_MASK",
instruction_name#"_"#vlmul.MX#"_MASK"))
- result_reg_class:$merge,
+ result_reg_class:$passthru,
vop_reg_class:$rs1,
scalar_reg_class:$rs2,
(mask_type V0),
@@ -1004,14 +1004,14 @@ multiclass VPatBinaryFPVL_R_VF<SDPatternOperator vop, string instruction_name,
let Predicates = GetVTypePredicates<fvti>.Predicates in
def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
fvti.RegClass:$rs1,
- (fvti.Vector fvti.RegClass:$merge),
+ (fvti.Vector fvti.RegClass:$passthru),
(fvti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK",
instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK"))
- fvti.RegClass:$merge,
+ fvti.RegClass:$passthru,
fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
(fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
}
@@ -1023,14 +1023,14 @@ multiclass VPatBinaryFPVL_R_VF_RM<SDPatternOperator vop, string instruction_name
let Predicates = GetVTypePredicates<fvti>.Predicates in
def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
fvti.RegClass:$rs1,
- (fvti.Vector fvti.RegClass:$merge),
+ (fvti.Vector fvti.RegClass:$passthru),
(fvti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(
!if(isSEWAware,
instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK",
instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK"))
- fvti.RegClass:$merge,
+ fvti.RegClass:$passthru,
fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
(fvti.Mask V0),
// Value to indicate no rounding mode change in
@@ -1044,11 +1044,11 @@ multiclass VPatIntegerSetCCVL_VV<VTypeInfo vti, string instruction_name,
CondCode cc> {
def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
vti.RegClass:$rs2, cc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX#"_MASK")
- VR:$merge,
+ VR:$passthru,
vti.RegClass:$rs1,
vti.RegClass:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
@@ -1060,11 +1060,11 @@ multiclass VPatIntegerSetCCVL_VV_Swappable<VTypeInfo vti, string instruction_nam
: VPatIntegerSetCCVL_VV<vti, instruction_name, cc> {
def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs2),
vti.RegClass:$rs1, invcc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX#"_MASK")
- VR:$merge, vti.RegClass:$rs1,
+ VR:$passthru, vti.RegClass:$rs1,
vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
@@ -1073,17 +1073,17 @@ multiclass VPatIntegerSetCCVL_VX_Swappable<VTypeInfo vti, string instruction_nam
defvar instruction_masked = !cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX#"_MASK");
def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
(SplatPat (XLenVT GPR:$rs2)), cc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
- (instruction_masked VR:$merge, vti.RegClass:$rs1,
+ (instruction_masked VR:$passthru, vti.RegClass:$rs1,
GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat (XLenVT GPR:$rs2)),
(vti.Vector vti.RegClass:$rs1), invcc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
- (instruction_masked VR:$merge, vti.RegClass:$rs1,
+ (instruction_masked VR:$passthru, vti.RegClass:$rs1,
GPR:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
@@ -1092,20 +1092,20 @@ multiclass VPatIntegerSetCCVL_VI_Swappable<VTypeInfo vti, string instruction_nam
defvar instruction_masked = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK");
def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
(SplatPat_simm5 simm5:$rs2), cc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
- (instruction_masked VR:$merge, vti.RegClass:$rs1,
+ (instruction_masked VR:$passthru, vti.RegClass:$rs1,
XLenVT:$rs2, (vti.Mask V0), GPR:$vl,
vti.Log2SEW)>;
// FIXME: Can do some canonicalization to remove these patterns.
def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat_simm5 simm5:$rs2),
(vti.Vector vti.RegClass:$rs1), invcc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
- (instruction_masked VR:$merge, vti.RegClass:$rs1,
+ (instruction_masked VR:$passthru, vti.RegClass:$rs1,
simm5:$rs2, (vti.Mask V0), GPR:$vl,
vti.Log2SEW)>;
}
@@ -1117,20 +1117,20 @@ multiclass VPatIntegerSetCCVL_VIPlus1_Swappable<VTypeInfo vti,
defvar instruction_masked = !cast<Instruction>(instruction_name#"_VI_"#vti.LMul.MX#"_MASK");
def : Pat<(vti.Mask (riscv_setcc_vl (vti.Vector vti.RegClass:$rs1),
(splatpat_kind simm5:$rs2), cc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
- (instruction_masked VR:$merge, vti.RegClass:$rs1,
+ (instruction_masked VR:$passthru, vti.RegClass:$rs1,
(DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl,
vti.Log2SEW)>;
// FIXME: Can do some canonicalization to remove these patterns.
def : Pat<(vti.Mask (riscv_setcc_vl (splatpat_kind simm5:$rs2),
(vti.Vector vti.RegClass:$rs1), invcc,
- VR:$merge,
+ VR:$passthru,
(vti.Mask V0),
VLOpFrag)),
- (instruction_masked VR:$merge, vti.RegClass:$rs1,
+ (instruction_masked VR:$passthru, vti.RegClass:$rs1,
(DecImm simm5:$rs2), (vti.Mask V0), GPR:$vl,
vti.Log2SEW)>;
}
@@ -1143,31 +1143,31 @@ multiclass VPatFPSetCCVL_VV_VF_FV<SDPatternOperator vop, CondCode cc,
def : Pat<(fvti.Mask (vop (fvti.Vector fvti.RegClass:$rs1),
fvti.RegClass:$rs2,
cc,
- VR:$merge,
+ VR:$passthru,
(fvti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(inst_name#"_VV_"#fvti.LMul.MX#"_MASK")
- VR:$merge, fvti.RegClass:$rs1,
+ VR:$passthru, fvti.RegClass:$rs1,
fvti.RegClass:$rs2, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
def : Pat<(fvti.Mask (vop (fvti.Vector fvti.RegClass:$rs1),
(SplatFPOp fvti.ScalarRegClass:$rs2),
cc,
- VR:$merge,
+ VR:$passthru,
(fvti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")
- VR:$merge, fvti.RegClass:$rs1,
+ VR:$passthru, fvti.RegClass:$rs1,
fvti.ScalarRegClass:$rs2, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
def : Pat<(fvti.Mask (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
(fvti.Vector fvti.RegClass:$rs1),
cc,
- VR:$merge,
+ VR:$passthru,
(fvti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")
- VR:$merge, fvti.RegClass:$rs1,
+ VR:$passthru, fvti.RegClass:$rs1,
fvti.ScalarRegClass:$rs2, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
}
@@ -1437,12 +1437,12 @@ multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge),
+ def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$passthru),
(vti.Vector vti.RegClass:$rs1), VR:$rs2,
(vti.Mask V0), VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- (vti_m1.Vector VR:$merge),
+ (vti_m1.Vector VR:$passthru),
(vti.Vector vti.RegClass:$rs1),
(vti_m1.Vector VR:$rs2),
(vti.Mask V0), GPR:$vl, vti.Log2SEW, (XLenVT timm:$policy))>;
@@ -1454,12 +1454,12 @@ multiclass VPatReductionVL_RM<SDNode vop, string instruction_name, bit is_float>
foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge),
+ def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$passthru),
(vti.Vector vti.RegClass:$rs1), VR:$rs2,
(vti.Mask V0), VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- (vti_m1.Vector VR:$merge),
+ (vti_m1.Vector VR:$passthru),
(vti.Vector vti.RegClass:$rs1),
(vti_m1.Vector VR:$rs2),
(vti.Mask V0),
@@ -1519,12 +1519,12 @@ multiclass VPatWidenReductionVL<SDNode vop, PatFrags extop, string instruction_n
defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
GetVTypePredicates<wti>.Predicates) in {
- def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+ def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
(wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
VR:$rs2, (vti.Mask V0), VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+ (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
(wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
(XLenVT timm:$policy))>;
}
@@ -1538,12 +1538,12 @@ multiclass VPatWidenReductionVL_RM<SDNode vop, PatFrags extop, string instructio
defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
GetVTypePredicates<wti>.Predicates) in {
- def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+ def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
(wti.Vector (extop (vti.Vector vti.RegClass:$rs1))),
VR:$rs2, (vti.Mask V0), VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+ (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
(wti_m1.Vector VR:$rs2), (vti.Mask V0),
// Value to indicate no rounding mode change in
// RISCVInsertReadWriteCSR
@@ -1561,12 +1561,12 @@ multiclass VPatWidenReductionVL_Ext_VL<SDNode vop, PatFrags extop, string instru
defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
GetVTypePredicates<wti>.Predicates) in {
- def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+ def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
(wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
VR:$rs2, (vti.Mask V0), VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+ (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
(wti_m1.Vector VR:$rs2), (vti.Mask V0), GPR:$vl, vti.Log2SEW,
(XLenVT timm:$policy))>;
}
@@ -1580,12 +1580,12 @@ multiclass VPatWidenReductionVL_Ext_VL_RM<SDNode vop, PatFrags extop, string ins
defvar wti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # wti.SEW # "M1");
let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
GetVTypePredicates<wti>.Predicates) in {
- def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$merge),
+ def: Pat<(wti_m1.Vector (vop (wti_m1.Vector VR:$passthru),
(wti.Vector (extop (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), VLOpFrag)),
VR:$rs2, (vti.Mask V0), VLOpFrag,
(XLenVT timm:$policy))),
(!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- (wti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1),
+ (wti_m1.Vector VR:$passthru), (vti.Vector vti.RegClass:$rs1),
(wti_m1.Vector VR:$rs2), (vti.Mask V0),
// Value to indicate no rounding mode change in
// RISCVInsertReadWriteCSR
@@ -2098,15 +2098,15 @@ multiclass VPatAVGADDVL_VV_VX_RM<SDNode vop, int vxrm, string suffix = ""> {
let Predicates = GetVTypePredicates<vti>.Predicates in {
def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
(vti.Vector vti.RegClass:$rs2),
- vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVAADD"#suffix#"_VV_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2,
+ vti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs2,
(vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vop (vti.Vector vti.RegClass:$rs1),
(vti.Vector (SplatPat (XLenVT GPR:$rs2))),
- vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVAADD"#suffix#"_VX_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2,
+ vti.RegClass:$passthru, vti.RegClass:$rs1, GPR:$rs2,
(vti.Mask V0), vxrm, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
@@ -2127,15 +2127,15 @@ foreach vti = AllIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
def : Pat<(riscv_sub_vl (vti.Vector (SplatPat (XLenVT GPR:$rs2))),
(vti.Vector vti.RegClass:$rs1),
- vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVRSUB_VX_"# vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2,
+ vti.RegClass:$passthru, vti.RegClass:$rs1, GPR:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_sub_vl (vti.Vector (SplatPat_simm5 simm5:$rs2)),
(vti.Vector vti.RegClass:$rs1),
- vti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ vti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVRSUB_VI_"# vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs1, simm5:$rs2,
+ vti.RegClass:$passthru, vti.RegClass:$rs1, simm5:$rs2,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
@@ -2157,18 +2157,18 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Mask V0), VLOpFrag)),
(wti.Vector (riscv_vmv_v_x_vl
(wti.Vector undef), 1, VLOpFrag)),
- wti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ wti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWADD_VV_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl (wti.Vector (riscv_zext_vl_oneuse
(vti.Vector vti.RegClass:$rs1),
(vti.Mask V0), VLOpFrag)),
(wti.Vector (riscv_vmv_v_x_vl
(wti.Vector undef), 1, VLOpFrag)),
- wti.RegClass:$merge, (vti.Mask V0), VLOpFrag),
+ wti.RegClass:$passthru, (vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWADDU_VV_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs1, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
@@ -2333,28 +2333,28 @@ foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
vti.RegClass:$rs1,
vti.RegClass:$rs2,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX)
- vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
(SplatPat XLenVT:$rs1),
vti.RegClass:$rs2,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX)
- vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),
(SplatPat_simm5 simm5:$rs1),
vti.RegClass:$rs2,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
- vti.RegClass:$merge, vti.RegClass:$rs2, simm5:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, simm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
}
@@ -2505,11 +2505,11 @@ foreach vti = AllFloatVectors in {
def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
(vti.Vector vti.RegClass:$rs2),
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag),
(!cast<Instruction>("PseudoVFSGNJ_VV_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs1,
vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
TAIL_AGNOSTIC)>;
@@ -2526,11 +2526,11 @@ foreach vti = AllFloatVectors in {
def : Pat<(riscv_fcopysign_vl (vti.Vector vti.RegClass:$rs1),
(SplatFPOp vti.ScalarRegClass:$rs2),
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag),
(!cast<Instruction>("PseudoVFSGNJ_V"#vti.ScalarSuffix#"_"# vti.LMul.MX#"_E"#vti.SEW#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs1,
vti.ScalarRegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW,
TAIL_AGNOSTIC)>;
@@ -2559,29 +2559,29 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in {
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
fvti.RegClass:$rs1,
fvti.RegClass:$rs2,
- fvti.RegClass:$merge,
+ fvti.RegClass:$passthru,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX)
- fvti.RegClass:$merge, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
(SplatFPOp (SelectFPImm (XLenVT GPR:$imm))),
fvti.RegClass:$rs2,
- fvti.RegClass:$merge,
+ fvti.RegClass:$passthru,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX)
- fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0),
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
(SplatFPOp (fvti.Scalar fpimm0)),
fvti.RegClass:$rs2,
- fvti.RegClass:$merge,
+ fvti.RegClass:$passthru,
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
- fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
GPR:$vl, fvti.Log2SEW)>;
}
}
@@ -2591,10 +2591,10 @@ foreach fvti = AllFloatVectors in {
def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),
(SplatFPOp fvti.ScalarRegClass:$rs1),
fvti.RegClass:$rs2,
- fvti.RegClass:$merge,
+ fvti.RegClass:$passthru,
VLOpFrag)),
(!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX)
- fvti.RegClass:$merge, fvti.RegClass:$rs2,
+ fvti.RegClass:$passthru, fvti.RegClass:$rs2,
(fvti.Scalar fvti.ScalarRegClass:$rs1),
(fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
}
@@ -2673,7 +2673,7 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- let Predicates = [HasVInstructionsBF16] in
+ let Predicates = [HasVInstructionsBF16Minimal] in
def : Pat<(fwti.Vector (any_riscv_fpextend_vl
(fvti.Vector fvti.RegClass:$rs1),
(fvti.Mask V0),
@@ -2731,7 +2731,7 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
- let Predicates = [HasVInstructionsBF16] in
+ let Predicates = [HasVInstructionsBF16Minimal] in
def : Pat<(fvti.Vector (any_riscv_fpround_vl
(fwti.Vector fwti.RegClass:$rs1),
(fwti.Mask V0), VLOpFrag)),
@@ -2866,10 +2866,10 @@ foreach mti = AllMasks in {
// 16.1. Integer Scalar Move Instructions
foreach vti = NoGroupIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (riscv_vmv_s_x_vl (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (riscv_vmv_s_x_vl (vti.Vector vti.RegClass:$passthru),
vti.ScalarRegClass:$rs1,
VLOpFrag)),
- (PseudoVMV_S_X $merge, vti.ScalarRegClass:$rs1, GPR:$vl,
+ (PseudoVMV_S_X $passthru, vti.ScalarRegClass:$rs1, GPR:$vl,
vti.Log2SEW)>;
}
}
@@ -2879,26 +2879,26 @@ foreach vti = AllIntegerVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
def : Pat<(vti.Vector (riscv_vrgather_vv_vl vti.RegClass:$rs2,
vti.RegClass:$rs1,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2,
uimm5:$imm,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$imm,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
@@ -2914,11 +2914,11 @@ foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector
(riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
(ivti.Vector ivti.RegClass:$rs1),
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, ivti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
@@ -2926,52 +2926,50 @@ foreach vti = AllIntegerVectors in {
// 16.2. Floating-Point Scalar Move Instructions
foreach vti = NoGroupFloatVectors in {
let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
(vti.Scalar (fpimm0)),
VLOpFrag)),
- (PseudoVMV_S_X $merge, (XLenVT X0), GPR:$vl, vti.Log2SEW)>;
- def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+ (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
(vti.Scalar (SelectFPImm (XLenVT GPR:$imm))),
VLOpFrag)),
- (PseudoVMV_S_X $merge, GPR:$imm, GPR:$vl, vti.Log2SEW)>;
- }
-}
-
-foreach vti = AllFloatVectors in {
- let Predicates = GetVTypePredicates<vti>.Predicates in {
- def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+ (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru),
vti.ScalarRegClass:$rs1,
VLOpFrag)),
- (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_"#vti.LMul.MX)
- vti.RegClass:$merge,
+ (!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix)
+ vti.RegClass:$passthru,
(vti.Scalar vti.ScalarRegClass:$rs1), GPR:$vl, vti.Log2SEW)>;
}
+}
+
+foreach vti = AllFloatVectors in {
defvar ivti = GetIntVTypeInfo<vti>.Vti;
let Predicates = GetVTypePredicates<ivti>.Predicates in {
def : Pat<(vti.Vector
(riscv_vrgather_vv_vl vti.RegClass:$rs2,
(ivti.Vector vti.RegClass:$rs1),
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VV_"# vti.LMul.MX#"_E"# vti.SEW#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector (riscv_vrgather_vx_vl vti.RegClass:$rs2, GPR:$rs1,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VX_"# vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(vti.Vector
(riscv_vrgather_vx_vl vti.RegClass:$rs2,
uimm5:$imm,
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$imm,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
@@ -2987,11 +2985,11 @@ foreach vti = AllFloatVectors in {
def : Pat<(vti.Vector
(riscv_vrgatherei16_vv_vl vti.RegClass:$rs2,
(ivti.Vector ivti.RegClass:$rs1),
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(inst#"_MASK")
- vti.RegClass:$merge, vti.RegClass:$rs2, ivti.RegClass:$rs1,
+ vti.RegClass:$passthru, vti.RegClass:$rs2, ivti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td
index d0a798e..b1a7a18 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXVentana.td
@@ -14,7 +14,7 @@
// XVentanaCondOps
//===----------------------------------------------------------------------===//
-let Predicates = [IsRV64, HasVendorXVentanaCondOps], hasSideEffects = 0,
+let Predicates = [HasVendorXVentanaCondOps], hasSideEffects = 0,
mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, DecoderNamespace = "XVentana" in
class VTMaskedMove<bits<3> funct3, string opcodestr>
: RVInstR<0b0000000, funct3, OPC_CUSTOM_3, (outs GPR:$rd),
@@ -28,18 +28,18 @@ def VT_MASKC : VTMaskedMove<0b110, "vt.maskc">,
def VT_MASKCN : VTMaskedMove<0b111, "vt.maskcn">,
Sched<[WriteIALU, ReadIALU, ReadIALU]>;
-let Predicates = [IsRV64, HasVendorXVentanaCondOps] in {
-def : Pat<(i64 (riscv_czero_eqz GPR:$rs1, GPR:$rc)),
+let Predicates = [HasVendorXVentanaCondOps] in {
+def : Pat<(XLenVT (riscv_czero_eqz GPR:$rs1, GPR:$rc)),
(VT_MASKC GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_nez GPR:$rs1, GPR:$rc)),
+def : Pat<(XLenVT (riscv_czero_nez GPR:$rs1, GPR:$rc)),
(VT_MASKCN GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_eqz GPR:$rs1, (riscv_setne (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_eqz GPR:$rs1, (riscv_setne (XLenVT GPR:$rc)))),
(VT_MASKC GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_eqz GPR:$rs1, (riscv_seteq (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_eqz GPR:$rs1, (riscv_seteq (XLenVT GPR:$rc)))),
(VT_MASKCN GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_nez GPR:$rs1, (riscv_setne (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_nez GPR:$rs1, (riscv_setne (XLenVT GPR:$rc)))),
(VT_MASKCN GPR:$rs1, GPR:$rc)>;
-def : Pat<(i64 (riscv_czero_nez GPR:$rs1, (riscv_seteq (i64 GPR:$rc)))),
+def : Pat<(XLenVT (riscv_czero_nez GPR:$rs1, (riscv_seteq (XLenVT GPR:$rc)))),
(VT_MASKC GPR:$rs1, GPR:$rc)>;
-} // Predicates = [IsRV64, HasVendorXVentanaCondOps]
+} // Predicates = [HasVendorXVentanaCondOps]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index e0f1c71..85715ca 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -272,6 +272,7 @@ def : Pat<(f16 (fabs FPR16:$rs1)), (FSGNJX_H $rs1, $rs1)>;
def : Pat<(riscv_fclass (f16 FPR16:$rs1)), (FCLASS_H $rs1)>;
def : PatFprFpr<fcopysign, FSGNJ_H, FPR16, f16>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_H, FPR16, f16>;
def : Pat<(f16 (fcopysign FPR16:$rs1, (f16 (fneg FPR16:$rs2)))), (FSGNJN_H $rs1, $rs2)>;
def : Pat<(f16 (fcopysign FPR16:$rs1, FPR32:$rs2)),
(FSGNJ_H $rs1, (FCVT_H_S $rs2, FRM_DYN))>;
@@ -314,6 +315,7 @@ def : Pat<(fabs FPR16INX:$rs1), (FSGNJX_H_INX $rs1, $rs1)>;
def : Pat<(riscv_fclass FPR16INX:$rs1), (FCLASS_H_INX $rs1)>;
def : PatFprFpr<fcopysign, FSGNJ_H_INX, FPR16INX, f16>;
+def : PatFprFpr<riscv_fsgnjx, FSGNJX_H_INX, FPR16INX, f16>;
def : Pat<(fcopysign FPR16INX:$rs1, (fneg FPR16INX:$rs2)), (FSGNJN_H_INX $rs1, $rs2)>;
def : Pat<(fcopysign FPR16INX:$rs1, FPR32INX:$rs2),
(FSGNJ_H_INX $rs1, (FCVT_H_S_INX $rs2, FRM_DYN))>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 75fcc1e..cd03ac2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -349,7 +349,7 @@ multiclass VPseudoVAESKF1 {
defvar mx = m.MX;
defm _VI : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, uimm5, m>,
SchedBinary<"WriteVAESKF1V", "ReadVAESKF1V", "ReadVAESKF1V", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -384,7 +384,7 @@ multiclass VPseudoVSM4K {
defvar mx = m.MX;
defm _VI : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, uimm5, m>,
SchedBinary<"WriteVSM4KV", "ReadVSM4KV", "ReadVSM4KV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -393,7 +393,7 @@ multiclass VPseudoVSM3ME {
defvar mx = m.MX;
defm _VV : VPseudoBinaryNoMaskPolicy_Zvk<m.vrclass, m.vrclass, m.vrclass, m>,
SchedBinary<"WriteVSM3MEV", "ReadVSM3MEV", "ReadVSM3MEV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -402,10 +402,10 @@ multiclass VPseudoVCLMUL_VV_VX {
defvar mx = m.MX;
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVCLMULV", "ReadVCLMULV", "ReadVCLMULV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVCLMULX", "ReadVCLMULV", "ReadVCLMULX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -422,7 +422,7 @@ multiclass VPseudoVBREV {
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoUnaryV_V<m>,
- SchedUnary<"WriteVBREVV", "ReadVBREVV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVBREVV", "ReadVBREVV", mx, forcePassthruRead=true>;
}
}
@@ -430,7 +430,7 @@ multiclass VPseudoVCLZ {
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoUnaryV_V<m>,
- SchedUnary<"WriteVCLZV", "ReadVCLZV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVCLZV", "ReadVCLZV", mx, forcePassthruRead=true>;
}
}
@@ -438,7 +438,7 @@ multiclass VPseudoVCTZ {
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoUnaryV_V<m>,
- SchedUnary<"WriteVCTZV", "ReadVCTZV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVCTZV", "ReadVCTZV", mx, forcePassthruRead=true>;
}
}
@@ -446,7 +446,7 @@ multiclass VPseudoVCPOP {
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoUnaryV_V<m>,
- SchedUnary<"WriteVCPOPV", "ReadVCPOPV", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVCPOPV", "ReadVCPOPV", mx, forcePassthruRead=true>;
}
}
@@ -455,13 +455,13 @@ multiclass VPseudoVWSLL {
defvar mx = m.MX;
defm "" : VPseudoBinaryW_VV<m>,
SchedBinary<"WriteVWSLLV", "ReadVWSLLV", "ReadVWSLLV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryW_VX<m>,
SchedBinary<"WriteVWSLLX", "ReadVWSLLV", "ReadVWSLLX", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryW_VI<uimm5, m>,
SchedUnary<"WriteVWSLLI", "ReadVWSLLV", mx,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -469,10 +469,10 @@ multiclass VPseudoVANDN {
foreach m = MxList in {
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVIALUX", "ReadVIALUV", "ReadVIALUX", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -480,7 +480,7 @@ multiclass VPseudoVBREV8 {
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoUnaryV_V<m>,
- SchedUnary<"WriteVBREV8V", "ReadVBREV8V", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVBREV8V", "ReadVBREV8V", mx, forcePassthruRead=true>;
}
}
@@ -488,7 +488,7 @@ multiclass VPseudoVREV8 {
foreach m = MxList in {
defvar mx = m.MX;
defm "" : VPseudoUnaryV_V<m>,
- SchedUnary<"WriteVREV8V", "ReadVREV8V", mx, forceMergeOpRead=true>;
+ SchedUnary<"WriteVREV8V", "ReadVREV8V", mx, forcePassthruRead=true>;
}
}
@@ -496,10 +496,10 @@ multiclass VPseudoVROT_VV_VX {
foreach m = MxList in {
defm "" : VPseudoBinaryV_VV<m>,
SchedBinary<"WriteVRotV", "ReadVRotV", "ReadVRotV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
defm "" : VPseudoBinaryV_VX<m>,
SchedBinary<"WriteVRotX", "ReadVRotV", "ReadVRotX", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -508,7 +508,7 @@ multiclass VPseudoVROT_VV_VX_VI
foreach m = MxList in {
defm "" : VPseudoBinaryV_VI<uimm6, m>,
SchedUnary<"WriteVRotI", "ReadVRotV", m.MX,
- forceMergeOpRead=true>;
+ forcePassthruRead=true>;
}
}
@@ -691,11 +691,11 @@ multiclass VPatUnaryVL_V<SDPatternOperator op, string instruction_name,
let Predicates = !listconcat([predicate],
GetVTypePredicates<vti>.Predicates) in {
def : Pat<(vti.Vector (op (vti.Vector vti.RegClass:$rs1),
- (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$passthru),
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>(instruction_name#"_V_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs1,
(vti.Mask V0),
GPR:$vl,
@@ -711,15 +711,15 @@ foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (riscv_and_vl (riscv_xor_vl
(vti.Vector vti.RegClass:$rs1),
(riscv_splat_vector -1),
- (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$passthru),
(vti.Mask V0),
VLOpFrag),
(vti.Vector vti.RegClass:$rs2),
- (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$passthru),
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVANDN_VV_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs2,
vti.RegClass:$rs1,
(vti.Mask V0),
@@ -730,11 +730,11 @@ foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (riscv_and_vl (riscv_splat_vector
(not vti.ScalarRegClass:$rs1)),
(vti.Vector vti.RegClass:$rs2),
- (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$passthru),
(vti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVANDN_VX_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs2,
vti.ScalarRegClass:$rs1,
(vti.Mask V0),
@@ -758,10 +758,10 @@ foreach vti = AllIntegerVectors in {
GetVTypePredicates<vti>.Predicates) in {
def : Pat<(riscv_rotl_vl vti.RegClass:$rs2,
(vti.Vector (SplatPat_uimm6 uimm6:$rs1)),
- (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVROR_VI_"#vti.LMul.MX#"_MASK")
- vti.RegClass:$merge,
+ vti.RegClass:$passthru,
vti.RegClass:$rs2,
(!cast<SDNodeXForm>("InvRot" # vti.SEW # "Imm") uimm6:$rs1),
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
@@ -778,10 +778,10 @@ foreach vtiToWti = AllWidenableIntVectors in {
def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (ext_oneuse (vti.Vector vti.RegClass:$rs1))),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
@@ -791,19 +791,19 @@ foreach vtiToWti = AllWidenableIntVectors in {
(wti.Vector (riscv_ext_vl_oneuse
(vti.Vector vti.RegClass:$rs1),
(vti.Mask V0), VLOpFrag)),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
@@ -811,19 +811,19 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Vector vti.RegClass:$rs2),
(vti.Mask V0), VLOpFrag)),
(wti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
(wti.Vector (zext_oneuse (vti.Vector vti.RegClass:$rs2))),
(wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_shl_vl
@@ -831,37 +831,37 @@ foreach vtiToWti = AllWidenableIntVectors in {
(vti.Vector vti.RegClass:$rs2),
(vti.Mask V0), VLOpFrag)),
(wti.Vector (SplatPat_uimm5 uimm5:$rs1)),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_vwsll_vl
(vti.Vector vti.RegClass:$rs2),
(vti.Vector vti.RegClass:$rs1),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VV_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, vti.RegClass:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_vwsll_vl
(vti.Vector vti.RegClass:$rs2),
(vti.Vector (Low8BitsSplatPat (XLenVT GPR:$rs1))),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VX_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, GPR:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
def : Pat<(riscv_vwsll_vl
(vti.Vector vti.RegClass:$rs2),
(vti.Vector (SplatPat_uimm5 uimm5:$rs1)),
- (wti.Vector wti.RegClass:$merge),
+ (wti.Vector wti.RegClass:$passthru),
(vti.Mask V0), VLOpFrag),
(!cast<Instruction>("PseudoVWSLL_VI_"#vti.LMul.MX#"_MASK")
- wti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$rs1,
+ wti.RegClass:$passthru, vti.RegClass:$rs2, uimm5:$rs1,
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
}
@@ -989,11 +989,11 @@ multiclass VPatBinaryV_VI_VROL<string intrinsic, string instruction,
!if(isSEWAware, instruction#"_VI_"#vti.LMul.MX#"_E"#vti.SEW,
instruction#"_VI_"#vti.LMul.MX));
let Predicates = GetVTypePredicates<vti>.Predicates in
- def : Pat<(vti.Vector (Intr (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (Intr (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs2),
(XLenVT uimm6:$rs1),
VLOpFrag)),
- (Pseudo (vti.Vector vti.RegClass:$merge),
+ (Pseudo (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs2),
(InvRot64Imm uimm6:$rs1),
GPR:$vl, vti.Log2SEW, TU_MU)>;
@@ -1003,12 +1003,12 @@ multiclass VPatBinaryV_VI_VROL<string intrinsic, string instruction,
!if(isSEWAware, instruction#"_VI_"#vti.LMul.MX#"_E"#vti.SEW#"_MASK",
instruction#"_VI_"#vti.LMul.MX#"_MASK"));
let Predicates = GetVTypePredicates<vti>.Predicates in
- def : Pat<(vti.Vector (IntrMask (vti.Vector vti.RegClass:$merge),
+ def : Pat<(vti.Vector (IntrMask (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs2),
(XLenVT uimm6:$rs1),
(vti.Mask V0),
VLOpFrag, (XLenVT timm:$policy))),
- (PseudoMask (vti.Vector vti.RegClass:$merge),
+ (PseudoMask (vti.Vector vti.RegClass:$passthru),
(vti.Vector vti.RegClass:$rs2),
(InvRot64Imm uimm6:$rs1),
(vti.Mask V0),
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index fecc83a..b6ac338 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -429,8 +429,16 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
NumOps = Flags.getNumOperandRegisters();
// Memory constraints have two operands.
- if (NumOps != 2 || !Flags.isMemKind())
+ if (NumOps != 2 || !Flags.isMemKind()) {
+ // If the register is used by something other than a memory constraint,
+ // we should not fold.
+ for (unsigned J = 0; J < NumOps; ++J) {
+ const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
+ if (MO.isReg() && MO.getReg() == DestReg)
+ return false;
+ }
continue;
+ }
// We can't do this for constraint A because AMO instructions don't have
// an immediate offset field.
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 6eed2ae..25b2498 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -21,6 +21,9 @@ class RISCVTuneInfo {
bits<32> MaxPrefetchIterationsAhead = -1;
bits<32> MinimumJumpTableEntries = 5;
+
+ // Tail duplication threshold at -O3.
+ bits<32> TailDupAggressiveThreshold = 6;
}
def RISCVTuneInfoTable : GenericTable {
@@ -29,7 +32,7 @@ def RISCVTuneInfoTable : GenericTable {
let Fields = ["Name", "PrefFunctionAlignment", "PrefLoopAlignment",
"CacheLineSize", "PrefetchDistance",
"MinPrefetchStride", "MaxPrefetchIterationsAhead",
- "MinimumJumpTableEntries"];
+ "MinimumJumpTableEntries", "TailDupAggressiveThreshold"];
}
def getRISCVTuneInfo : SearchIndex {
@@ -83,9 +86,11 @@ def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
def ROCKET : RISCVTuneProcessorModel<"rocket",
RocketModel>;
+defvar SiFive7TuneFeatures = [TuneSiFive7, TuneNoDefaultUnroll,
+ TuneShortForwardBranchOpt,
+ FeaturePostRAScheduler];
def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series",
- SiFive7Model,
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7Model, SiFive7TuneFeatures>;
def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
RocketModel,
@@ -145,7 +150,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtC],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
RocketModel,
@@ -189,7 +194,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
FeatureStdExtD,
FeatureStdExtC,
FeatureStdExtZihintpause],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
RocketModel,
@@ -212,8 +217,11 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7, FeaturePostRAScheduler]>;
+ SiFive7TuneFeatures>;
+defvar SiFiveX280TuneFeatures = !listconcat(SiFive7TuneFeatures,
+ [TuneDLenFactor2,
+ TuneOptimizedZeroStrideLoad]);
def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
[Feature64Bit,
FeatureStdExtI,
@@ -229,10 +237,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
FeatureStdExtZvfh,
FeatureStdExtZba,
FeatureStdExtZbb],
- [TuneSiFive7,
- FeaturePostRAScheduler,
- TuneDLenFactor2,
- TuneOptimizedZeroStrideLoad]>;
+ SiFiveX280TuneFeatures>;
def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
[Feature64Bit,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index b299114..0b0ac0c 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -1287,11 +1287,11 @@ def : ReadAdvance<ReadVMov8V, 0>;
// Others
def : ReadAdvance<ReadVMask, 0>;
-def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
foreach mx = SchedMxList in {
- def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
foreach sew = SchedSEWSet<mx>.val in
- def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx # "_E" # sew), 0>;
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index ba062f3..59972d7 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -1086,11 +1086,11 @@ def : ReadAdvance<ReadVMov8V, 0>;
// Others
def : ReadAdvance<ReadVMask, 0>;
-def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
foreach mx = SchedMxList in {
- def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
foreach sew = SchedSEWSet<mx>.val in
- def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx # "_E" # sew), 0>;
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>;
}
// Vector Crypto Extensions
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 449611c..95fde1e 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -766,11 +766,11 @@ def ReadVMov8V : SchedRead;
// Others
def ReadVMask : SchedRead;
-def ReadVMergeOp_WorstCase : SchedRead;
+def ReadVPassthru_WorstCase : SchedRead;
foreach mx = SchedMxList in {
- def ReadVMergeOp_ # mx : SchedRead;
+ def ReadVPassthru_ # mx : SchedRead;
foreach sew = SchedSEWSet<mx>.val in
- def ReadVMergeOp_ # mx # "_E" # sew : SchedRead;
+ def ReadVPassthru_ # mx # "_E" # sew : SchedRead;
}
//===----------------------------------------------------------------------===//
@@ -1139,11 +1139,11 @@ def : ReadAdvance<ReadVMov8V, 0>;
// Others
def : ReadAdvance<ReadVMask, 0>;
-def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+def : ReadAdvance<ReadVPassthru_WorstCase, 0>;
foreach mx = SchedMxList in {
- def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>;
foreach sew = SchedSEWSet<mx>.val in
- def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx # "_E" # sew), 0>;
+ def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>;
}
} // Unsupported
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 377d080..ea54ff1 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -50,6 +50,9 @@ struct RISCVTuneInfo {
unsigned MaxPrefetchIterationsAhead;
unsigned MinimumJumpTableEntries;
+
+ // Tail duplication threshold at -O3.
+ unsigned TailDupAggressiveThreshold;
};
#define GET_RISCVTuneInfoTable_DECL
@@ -225,7 +228,7 @@ public:
bool hasVInstructionsI64() const { return HasStdExtZve64x; }
bool hasVInstructionsF16Minimal() const { return HasStdExtZvfhmin; }
bool hasVInstructionsF16() const { return HasStdExtZvfh; }
- bool hasVInstructionsBF16() const { return HasStdExtZvfbfmin; }
+ bool hasVInstructionsBF16Minimal() const { return HasStdExtZvfbfmin; }
bool hasVInstructionsF32() const { return HasStdExtZve32f; }
bool hasVInstructionsF64() const { return HasStdExtZve64d; }
// F16 and F64 both require F32.
@@ -300,6 +303,10 @@ public:
unsigned getMinimumJumpTableEntries() const;
+ unsigned getTailDupAggressiveThreshold() const {
+ return TuneInfo->TailDupAggressiveThreshold;
+ }
+
bool supportsInitUndef() const override { return hasVInstructions(); }
};
} // End llvm namespace
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5a92d6b..4cd904c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -280,7 +280,7 @@ bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
- return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
+ return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
? TTI::PSK_FastHardware
: TTI::PSK_Software;
}
@@ -1100,30 +1100,33 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ // For fp vector to mask, we use:
+ // vfncvt.rtz.x.f.w v9, v8
+ // vand.vi v8, v9, 1
+ // vmsne.vi v0, v8, 0
+ if (Dst->getScalarSizeInBits() == 1)
+ return 3;
+
+ if (std::abs(PowDiff) <= 1)
+ return 1;
+
+ // Counts of narrow/widen instructions.
+ return std::abs(PowDiff);
+
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
- if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
- // The cost of convert from or to mask vector is different from other
- // cases. We could not use PowDiff to calculate it.
- // For mask vector to fp, we should use the following instructions:
- // vmv.v.i v8, 0
- // vmerge.vim v8, v8, -1, v0
- // vfcvt.f.x.v v8, v8
-
- // And for fp vector to mask, we use:
- // vfncvt.rtz.x.f.w v9, v8
- // vand.vi v8, v9, 1
- // vmsne.vi v0, v8, 0
+ // For mask vector to fp, we should use the following instructions:
+ // vmv.v.i v8, 0
+ // vmerge.vim v8, v8, -1, v0
+ // vfcvt.f.x.v v8, v8
+ if (Src->getScalarSizeInBits() == 1)
return 3;
- }
+
if (std::abs(PowDiff) <= 1)
return 1;
// Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
// so it only need two conversion.
- if (Src->isIntOrIntVectorTy())
- return 2;
- // Counts of narrow/widen instructions.
- return std::abs(PowDiff);
+ return 2;
}
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
@@ -1390,14 +1393,32 @@ InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
InstructionCost Cost = 0;
if (Opcode == Instruction::Store && OpInfo.isConstant())
Cost += getStoreImmCost(Src, OpInfo, CostKind);
- InstructionCost BaseCost =
- BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
- CostKind, OpInfo, I);
+
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
+
+ InstructionCost BaseCost = [&]() {
+ InstructionCost Cost = LT.first;
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost;
+
+ // Our actual lowering for the case where a wider legal type is available
+ // uses a VL predicated load on the wider type. This is reflected in
+ // the result of getTypeLegalizationCost, but BasicTTI assumes the
+ // widened cases are scalarized.
+ const DataLayout &DL = this->getDataLayout();
+ if (Src->isVectorTy() && LT.second.isVector() &&
+ TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
+ LT.second.getSizeInBits()))
+ return Cost;
+
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind, OpInfo, I);
+ }();
+
// Assume memory ops cost scale with the number of vector registers
// possible accessed by the instruction. Note that BasicTTI already
// handles the LT.first term for us.
- if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
- LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
+ if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
BaseCost *= TLI->getLMULCost(LT.second);
return Cost + BaseCost;
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 20c014a..979677e 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -254,11 +254,12 @@ bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const {
CASE_VMERGE_TO_VMV(M8)
}
- Register MergeReg = MI.getOperand(1).getReg();
+ Register PassthruReg = MI.getOperand(1).getReg();
Register FalseReg = MI.getOperand(2).getReg();
- // Check merge == false (or merge == undef)
- if (MergeReg != RISCV::NoRegister && TRI->lookThruCopyLike(MergeReg, MRI) !=
- TRI->lookThruCopyLike(FalseReg, MRI))
+ // Check passthru == false (or passthru == undef)
+ if (PassthruReg != RISCV::NoRegister &&
+ TRI->lookThruCopyLike(PassthruReg, MRI) !=
+ TRI->lookThruCopyLike(FalseReg, MRI))
return false;
assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0);
@@ -266,14 +267,14 @@ bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const {
return false;
MI.setDesc(TII->get(NewOpc));
- MI.removeOperand(1); // Merge operand
+ MI.removeOperand(1); // Passthru operand
MI.tieOperands(0, 1); // Tie false to dest
MI.removeOperand(3); // Mask operand
MI.addOperand(
MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED));
// vmv.v.v doesn't have a mask operand, so we may be able to inflate the
- // register class for the destination and merge operands e.g. VRNoV0 -> VR
+ // register class for the destination and passthru operands e.g. VRNoV0 -> VR
MRI->recomputeRegClass(MI.getOperand(0).getReg());
MRI->recomputeRegClass(MI.getOperand(1).getReg());
return true;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 8391e0d..ed786bd 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -32,27 +32,26 @@
#include "llvm/IR/IntrinsicsSPIRV.h"
#include "llvm/Support/Debug.h"
-namespace llvm {
+namespace {
-class SPIRVMachineModuleInfo : public MachineModuleInfoImpl {
-public:
- SyncScope::ID Work_ItemSSID;
- SyncScope::ID WorkGroupSSID;
- SyncScope::ID DeviceSSID;
- SyncScope::ID AllSVMDevicesSSID;
- SyncScope::ID SubGroupSSID;
-
- SPIRVMachineModuleInfo(const MachineModuleInfo &MMI) {
- LLVMContext &CTX = MMI.getModule()->getContext();
- Work_ItemSSID = CTX.getOrInsertSyncScopeID("work_item");
- WorkGroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
- DeviceSSID = CTX.getOrInsertSyncScopeID("device");
- AllSVMDevicesSSID = CTX.getOrInsertSyncScopeID("all_svm_devices");
- SubGroupSSID = CTX.getOrInsertSyncScopeID("sub_group");
+struct SyncScopeIDs {
+ llvm::SyncScope::ID Work_ItemSSID;
+ llvm::SyncScope::ID WorkGroupSSID;
+ llvm::SyncScope::ID DeviceSSID;
+ llvm::SyncScope::ID AllSVMDevicesSSID;
+ llvm::SyncScope::ID SubGroupSSID;
+
+ SyncScopeIDs() {}
+ SyncScopeIDs(llvm::LLVMContext &Context) {
+ Work_ItemSSID = Context.getOrInsertSyncScopeID("work_item");
+ WorkGroupSSID = Context.getOrInsertSyncScopeID("workgroup");
+ DeviceSSID = Context.getOrInsertSyncScopeID("device");
+ AllSVMDevicesSSID = Context.getOrInsertSyncScopeID("all_svm_devices");
+ SubGroupSSID = Context.getOrInsertSyncScopeID("sub_group");
}
};
-} // end namespace llvm
+} // namespace
#define DEBUG_TYPE "spirv-isel"
@@ -76,7 +75,7 @@ class SPIRVInstructionSelector : public InstructionSelector {
const RegisterBankInfo &RBI;
SPIRVGlobalRegistry &GR;
MachineRegisterInfo *MRI;
- SPIRVMachineModuleInfo *MMI = nullptr;
+ SyncScopeIDs SSIDs;
/// We need to keep track of the number we give to anonymous global values to
/// generate the same name every time when this is needed.
@@ -280,7 +279,7 @@ void SPIRVInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
CodeGenCoverage *CoverageInfo,
ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI) {
- MMI = &MF.getMMI().getObjFileInfo<SPIRVMachineModuleInfo>();
+ SSIDs = SyncScopeIDs(MF.getFunction().getContext());
MRI = &MF.getRegInfo();
GR.setCurrentFunc(MF);
InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
@@ -721,16 +720,16 @@ bool SPIRVInstructionSelector::selectBitcast(Register ResVReg,
}
static SPIRV::Scope::Scope getScope(SyncScope::ID Ord,
- SPIRVMachineModuleInfo *MMI) {
- if (Ord == SyncScope::SingleThread || Ord == MMI->Work_ItemSSID)
+ const SyncScopeIDs &SSIDs) {
+ if (Ord == SyncScope::SingleThread || Ord == SSIDs.Work_ItemSSID)
return SPIRV::Scope::Invocation;
- else if (Ord == SyncScope::System || Ord == MMI->DeviceSSID)
+ else if (Ord == SyncScope::System || Ord == SSIDs.DeviceSSID)
return SPIRV::Scope::Device;
- else if (Ord == MMI->WorkGroupSSID)
+ else if (Ord == SSIDs.WorkGroupSSID)
return SPIRV::Scope::Workgroup;
- else if (Ord == MMI->AllSVMDevicesSSID)
+ else if (Ord == SSIDs.AllSVMDevicesSSID)
return SPIRV::Scope::CrossDevice;
- else if (Ord == MMI->SubGroupSSID)
+ else if (Ord == SSIDs.SubGroupSSID)
return SPIRV::Scope::Subgroup;
else
// OpenCL approach is: "The functions that do not have memory_scope argument
@@ -896,7 +895,7 @@ bool SPIRVInstructionSelector::selectAtomicRMW(Register ResVReg,
assert(I.hasOneMemOperand());
const MachineMemOperand *MemOp = *I.memoperands_begin();
uint32_t Scope =
- static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), MMI));
+ static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), SSIDs));
Register ScopeReg = buildI32Constant(Scope, I);
Register Ptr = I.getOperand(1).getReg();
@@ -967,7 +966,7 @@ bool SPIRVInstructionSelector::selectFence(MachineInstr &I) const {
uint32_t MemSem = static_cast<uint32_t>(getMemSemantics(AO));
Register MemSemReg = buildI32Constant(MemSem, I);
SyncScope::ID Ord = SyncScope::ID(I.getOperand(1).getImm());
- uint32_t Scope = static_cast<uint32_t>(getScope(Ord, MMI));
+ uint32_t Scope = static_cast<uint32_t>(getScope(Ord, SSIDs));
Register ScopeReg = buildI32Constant(Scope, I);
MachineBasicBlock &BB = *I.getParent();
return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpMemoryBarrier))
@@ -987,7 +986,7 @@ bool SPIRVInstructionSelector::selectAtomicCmpXchg(Register ResVReg,
assert(I.hasOneMemOperand());
const MachineMemOperand *MemOp = *I.memoperands_begin();
unsigned Scope =
- static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), MMI));
+ static_cast<uint32_t>(getScope(MemOp->getSyncScopeID(), SSIDs));
ScopeReg = buildI32Constant(Scope, I);
unsigned ScSem = static_cast<uint32_t>(
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 50aa194..42b8248 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1731,16 +1731,12 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ setOperationAction(ISD::ADDC, MVT::i32, Legal);
+ setOperationAction(ISD::ADDE, MVT::i32, Legal);
+ setOperationAction(ISD::SUBC, MVT::i32, Legal);
+ setOperationAction(ISD::SUBE, MVT::i32, Legal);
if (Subtarget->is64Bit()) {
- setOperationAction(ISD::ADDC, MVT::i64, Custom);
- setOperationAction(ISD::ADDE, MVT::i64, Custom);
- setOperationAction(ISD::SUBC, MVT::i64, Custom);
- setOperationAction(ISD::SUBE, MVT::i64, Custom);
setOperationAction(ISD::BITCAST, MVT::f64, Expand);
setOperationAction(ISD::BITCAST, MVT::i64, Expand);
setOperationAction(ISD::SELECT, MVT::i64, Expand);
@@ -1855,9 +1851,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MULHU, MVT::i64, Expand);
setOperationAction(ISD::MULHS, MVT::i64, Expand);
- setOperationAction(ISD::UMULO, MVT::i64, Custom);
- setOperationAction(ISD::SMULO, MVT::i64, Custom);
-
setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
@@ -3105,110 +3098,6 @@ static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
return DstReg128;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
-
- if (Op.getValueType() != MVT::i64)
- return Op;
-
- SDLoc dl(Op);
- SDValue Src1 = Op.getOperand(0);
- SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
- SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
- DAG.getConstant(32, dl, MVT::i64));
- Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
-
- SDValue Src2 = Op.getOperand(1);
- SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
- SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
- DAG.getConstant(32, dl, MVT::i64));
- Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
-
-
- bool hasChain = false;
- unsigned hiOpc = Op.getOpcode();
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case ISD::ADDC: hiOpc = ISD::ADDE; break;
- case ISD::ADDE: hasChain = true; break;
- case ISD::SUBC: hiOpc = ISD::SUBE; break;
- case ISD::SUBE: hasChain = true; break;
- }
- SDValue Lo;
- SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
- if (hasChain) {
- Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
- Op.getOperand(2));
- } else {
- Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
- }
- SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
- SDValue Carry = Hi.getValue(1);
-
- Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
- Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
- Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
- DAG.getConstant(32, dl, MVT::i64));
-
- SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
- SDValue Ops[2] = { Dst, Carry };
- return DAG.getMergeValues(Ops, dl);
-}
-
-// Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
-// in LegalizeDAG.cpp except the order of arguments to the library function.
-static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
- const SparcTargetLowering &TLI)
-{
- unsigned opcode = Op.getOpcode();
- assert((opcode == ISD::UMULO || opcode == ISD::SMULO) && "Invalid Opcode.");
-
- bool isSigned = (opcode == ISD::SMULO);
- EVT VT = MVT::i64;
- EVT WideVT = MVT::i128;
- SDLoc dl(Op);
- SDValue LHS = Op.getOperand(0);
-
- if (LHS.getValueType() != VT)
- return Op;
-
- SDValue ShiftAmt = DAG.getConstant(63, dl, VT);
-
- SDValue RHS = Op.getOperand(1);
- SDValue HiLHS, HiRHS;
- if (isSigned) {
- HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, ShiftAmt);
- HiRHS = DAG.getNode(ISD::SRA, dl, MVT::i64, RHS, ShiftAmt);
- } else {
- HiLHS = DAG.getConstant(0, dl, VT);
- HiRHS = DAG.getConstant(0, dl, MVT::i64);
- }
-
- SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
-
- TargetLowering::MakeLibCallOptions CallOptions;
- CallOptions.setSExt(isSigned);
- SDValue MulResult = TLI.makeLibCall(DAG,
- RTLIB::MUL_I128, WideVT,
- Args, CallOptions, dl).first;
- SDValue BottomHalf, TopHalf;
- std::tie(BottomHalf, TopHalf) = DAG.SplitScalar(MulResult, dl, VT, VT);
- if (isSigned) {
- SDValue Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
- TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, Tmp1, ISD::SETNE);
- } else {
- TopHalf = DAG.getSetCC(dl, MVT::i32, TopHalf, DAG.getConstant(0, dl, VT),
- ISD::SETNE);
- }
- // MulResult is a node with an illegal type. Because such things are not
- // generally permitted during this phase of legalization, ensure that
- // nothing is left using the node. The above EXTRACT_ELEMENT nodes should have
- // been folded.
- assert(MulResult->use_empty() && "Illegally typed node still in use!");
-
- SDValue Ops[2] = { BottomHalf, TopHalf } ;
- return DAG.getMergeValues(Ops, dl);
-}
-
static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering())) {
// Expand with a fence.
@@ -3283,12 +3172,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FNEG: return LowerFNEGorFABS(Op, DAG, isV9);
case ISD::FP_EXTEND: return LowerF128_FPEXTEND(Op, DAG, *this);
case ISD::FP_ROUND: return LowerF128_FPROUND(Op, DAG, *this);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
- case ISD::UMULO:
- case ISD::SMULO: return LowerUMULO_SMULO(Op, DAG, *this);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index c7f88fe..6e17150 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3776,6 +3776,17 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
if (X86::optimizeShiftRotateWithImmediateOne(Inst))
return true;
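+ // When an explicit {evex} prefix is requested on a plain CMP/TEST, rewrite it
+ // to the corresponding APX CCMP/CTEST opcode: clear the EVEX hint flag and
+ // append the immediate operands (default-flags value and condition code) that
+ // those opcodes expect.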
+ auto replaceWithCCMPCTEST = [&](unsigned Opcode) -> bool {
+ if (ForcedOpcodePrefix == OpcodePrefix_EVEX) {
+ Inst.setFlags(~(X86::IP_USE_EVEX) & Inst.getFlags());
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(MCOperand::createImm(0));
+ Inst.addOperand(MCOperand::createImm(10));
+ return true;
+ }
+ return false;
+ };
+
switch (Inst.getOpcode()) {
default: return false;
case X86::JMP_1:
@@ -3807,6 +3818,61 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
Inst.setOpcode(X86::INT3);
return true;
}
+ // `{evex} cmp <>, <>` is an alias of `ccmpt {dfv=} <>, <>`, and
+ // `{evex} test <>, <>` is an alias of `ctestt {dfv=} <>, <>`.
+#define FROM_TO(FROM, TO) \
+ case X86::FROM: \
+ return replaceWithCCMPCTEST(X86::TO);
+ FROM_TO(CMP64rr, CCMP64rr)
+ FROM_TO(CMP64mi32, CCMP64mi32)
+ FROM_TO(CMP64mi8, CCMP64mi8)
+ FROM_TO(CMP64mr, CCMP64mr)
+ FROM_TO(CMP64ri32, CCMP64ri32)
+ FROM_TO(CMP64ri8, CCMP64ri8)
+ FROM_TO(CMP64rm, CCMP64rm)
+
+ FROM_TO(CMP32rr, CCMP32rr)
+ FROM_TO(CMP32mi, CCMP32mi)
+ FROM_TO(CMP32mi8, CCMP32mi8)
+ FROM_TO(CMP32mr, CCMP32mr)
+ FROM_TO(CMP32ri, CCMP32ri)
+ FROM_TO(CMP32ri8, CCMP32ri8)
+ FROM_TO(CMP32rm, CCMP32rm)
+
+ FROM_TO(CMP16rr, CCMP16rr)
+ FROM_TO(CMP16mi, CCMP16mi)
+ FROM_TO(CMP16mi8, CCMP16mi8)
+ FROM_TO(CMP16mr, CCMP16mr)
+ FROM_TO(CMP16ri, CCMP16ri)
+ FROM_TO(CMP16ri8, CCMP16ri8)
+ FROM_TO(CMP16rm, CCMP16rm)
+
+ FROM_TO(CMP8rr, CCMP8rr)
+ FROM_TO(CMP8mi, CCMP8mi)
+ FROM_TO(CMP8mr, CCMP8mr)
+ FROM_TO(CMP8ri, CCMP8ri)
+ FROM_TO(CMP8rm, CCMP8rm)
+
+ FROM_TO(TEST64rr, CTEST64rr)
+ FROM_TO(TEST64mi32, CTEST64mi32)
+ FROM_TO(TEST64mr, CTEST64mr)
+ FROM_TO(TEST64ri32, CTEST64ri32)
+
+ FROM_TO(TEST32rr, CTEST32rr)
+ FROM_TO(TEST32mi, CTEST32mi)
+ FROM_TO(TEST32mr, CTEST32mr)
+ FROM_TO(TEST32ri, CTEST32ri)
+
+ FROM_TO(TEST16rr, CTEST16rr)
+ FROM_TO(TEST16mi, CTEST16mi)
+ FROM_TO(TEST16mr, CTEST16mr)
+ FROM_TO(TEST16ri, CTEST16ri)
+
+ FROM_TO(TEST8rr, CTEST8rr)
+ FROM_TO(TEST8mi, CTEST8mi)
+ FROM_TO(TEST8mr, CTEST8mr)
+ FROM_TO(TEST8ri, CTEST8ri)
+#undef FROM_TO
}
}
@@ -4158,7 +4224,10 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Unsupported;
break;
case OpcodePrefix_EVEX:
- if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)
+ if (is64BitMode() && (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
+ !X86::isCMP(Opc) && !X86::isTEST(Opc))
+ return Match_Unsupported;
+ if (!is64BitMode() && (TSFlags & X86II::EncodingMask) != X86II::EVEX)
return Match_Unsupported;
break;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index fcc61d0..cf0cb92 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -201,7 +201,7 @@ public:
bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
unsigned &RemainingSize) const;
- bool finishLayout(const MCAssembler &Asm) const override;
+ void finishLayout(const MCAssembler &Asm) const override;
unsigned getMaximumNopSize(const MCSubtargetInfo &STI) const override;
@@ -437,8 +437,6 @@ static size_t getSizeForInstFragment(const MCFragment *F) {
return cast<MCDataFragment>(*F).getContents().size();
case MCFragment::FT_Relaxable:
return cast<MCRelaxableFragment>(*F).getContents().size();
- case MCFragment::FT_CompactEncodedInst:
- return cast<MCCompactEncodedInstFragment>(*F).getContents().size();
}
}
@@ -856,7 +854,7 @@ bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
return Changed;
}
-bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
+void X86AsmBackend::finishLayout(MCAssembler const &Asm) const {
// See if we can further relax some instructions to cut down on the number of
// nop bytes required for code alignment. The actual win is in reducing
// instruction count, not number of bytes. Modern X86-64 can easily end up
@@ -864,7 +862,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
// (i.e. eliminate nops) even at the cost of increasing the size and
// complexity of others.
if (!X86PadForAlign && !X86PadForBranchAlign)
- return false;
+ return;
  // The processed regions are delimited by LabeledFragments. -g may have more
// MCSymbols and therefore different relaxation results. X86PadForAlign is
@@ -884,9 +882,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
if (LabeledFragments.count(&F))
Relaxable.clear();
- if (F.getKind() == MCFragment::FT_Data ||
- F.getKind() == MCFragment::FT_CompactEncodedInst)
- // Skip and ignore
+ if (F.getKind() == MCFragment::FT_Data) // Skip and ignore
continue;
if (F.getKind() == MCFragment::FT_Relaxable) {
@@ -911,6 +907,9 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
continue;
}
+#ifndef NDEBUG
+ const uint64_t OrigOffset = Asm.getFragmentOffset(F);
+#endif
const uint64_t OrigSize = Asm.computeFragmentSize(F);
// To keep the effects local, prefer to relax instructions closest to
@@ -923,7 +922,8 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
// Give the backend a chance to play any tricks it wishes to increase
// the encoding size of the given instruction. Target independent code
  // will try further relaxation, but targets may play further tricks.
- padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize);
+ if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize))
+ Sec.setHasLayout(false);
// If we have an instruction which hasn't been fully relaxed, we can't
// skip past it and insert bytes before it. Changing its starting
@@ -940,6 +940,14 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
if (F.getKind() == MCFragment::FT_BoundaryAlign)
cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
+#ifndef NDEBUG
+ const uint64_t FinalOffset = Asm.getFragmentOffset(F);
+ const uint64_t FinalSize = Asm.computeFragmentSize(F);
+ assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+ "can't move start of next fragment!");
+ assert(FinalSize == RemainingSize && "inconsistent size computation?");
+#endif
+
// If we're looking at a boundary align, make sure we don't try to pad
// its target instructions for some following directive. Doing so would
// break the alignment of the current boundary align.
@@ -953,7 +961,11 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const {
}
}
- return true;
+ // The layout is done. Mark every fragment as valid.
+ for (MCSection &Section : Asm) {
+ Asm.getFragmentOffset(*Section.curFragList()->Tail);
+ Asm.computeFragmentSize(*Section.curFragList()->Tail);
+ }
}
unsigned X86AsmBackend::getMaximumNopSize(const MCSubtargetInfo &STI) const {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ad59b13..7340539 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2475,8 +2475,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
// clang-format off
for (ISD::NodeType Op :
- {ISD::FCEIL, ISD::STRICT_FCEIL,
+ {ISD::FACOS, ISD::STRICT_FACOS,
+ ISD::FASIN, ISD::STRICT_FASIN,
+ ISD::FATAN, ISD::STRICT_FATAN,
+ ISD::FCEIL, ISD::STRICT_FCEIL,
ISD::FCOS, ISD::STRICT_FCOS,
+ ISD::FCOSH, ISD::STRICT_FCOSH,
ISD::FEXP, ISD::STRICT_FEXP,
ISD::FFLOOR, ISD::STRICT_FFLOOR,
ISD::FREM, ISD::STRICT_FREM,
@@ -2484,7 +2488,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FLOG10, ISD::STRICT_FLOG10,
ISD::FPOW, ISD::STRICT_FPOW,
ISD::FSIN, ISD::STRICT_FSIN,
- ISD::FTAN, ISD::STRICT_FTAN})
+ ISD::FSINH, ISD::STRICT_FSINH,
+ ISD::FTAN, ISD::STRICT_FTAN,
+ ISD::FTANH, ISD::STRICT_FTANH})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
// clang-format on
@@ -3435,12 +3441,9 @@ X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
if (BaseCost >= 0 && Subtarget.hasCCMP())
BaseCost += BrMergingCcmpBias;
// a == b && a == c is a fast pattern on x86.
- ICmpInst::Predicate Pred;
if (BaseCost >= 0 && Opc == Instruction::And &&
- match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
- Pred == ICmpInst::ICMP_EQ &&
- match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
- Pred == ICmpInst::ICMP_EQ)
+ match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
+ match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
BaseCost += 1;
return {BaseCost, BrMergingLikelyBias.getValue(),
BrMergingUnlikelyBias.getValue()};
@@ -30760,10 +30763,12 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
- if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
- return Pred == CmpInst::ICMP_SLT;
- if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
- return Pred == CmpInst::ICMP_SGT;
+ if (match(I->user_back(),
+ m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
+ return true;
+ if (match(I->user_back(),
+ m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+ return true;
}
return false;
}
@@ -30771,10 +30776,12 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
- if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
- return Pred == CmpInst::ICMP_SLT;
- if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
- return Pred == CmpInst::ICMP_SGT;
+ if (match(I->user_back(),
+ m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
+ return true;
+ if (match(I->user_back(),
+ m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+ return true;
}
return false;
}
@@ -30785,18 +30792,21 @@ static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
Pred == CmpInst::ICMP_SLT;
- if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
- return Pred == CmpInst::ICMP_SGT;
+ if (match(I->user_back(),
+ m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+ return true;
return false;
}
if (Opc == AtomicRMWInst::Xor) {
if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
- if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
- return Pred == CmpInst::ICMP_SLT;
- if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
- return Pred == CmpInst::ICMP_SGT;
+ if (match(I->user_back(),
+ m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
+ return true;
+ if (match(I->user_back(),
+ m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
+ return true;
}
return false;
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 918a608..7fc786b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3121,9 +3121,9 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
unsigned Opcode = MCID.getOpcode();
- if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode) ||
- X86::isCFCMOVCC(Opcode) || X86::isCCMPCC(Opcode) ||
- X86::isCTESTCC(Opcode)))
+ if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
+ X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
+ X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
return -1;
// Assume that condition code is always the last use operand.
unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
@@ -3145,8 +3145,9 @@ X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
}
X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
- return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
- : X86::COND_INVALID;
+ return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
+ ? X86::getCondFromMI(MI)
+ : X86::COND_INVALID;
}
X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 63ac910..697d30a 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -792,10 +792,10 @@ def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
def: InstRW<[BWWriteResGroup28], (instrs VPBROADCASTBrr,
VPBROADCASTWrr)>;
-def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
+def BWWriteResGroup33 : SchedWriteRes<[BWPort5]> {
let Latency = 3;
- let NumMicroOps = 3;
- let ReleaseAtCycles = [2,1];
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [2];
}
def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWrr,
MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 516dc62..c4d2ad7 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -1247,10 +1247,10 @@ def: InstRW<[HWWriteResGroup53_1], (instrs VPMOVSXBWYrm,
VPMOVSXWDYrm,
VPMOVZXWDYrm)>;
-def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5]> {
let Latency = 3;
- let NumMicroOps = 3;
- let ReleaseAtCycles = [2,1];
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [2];
}
def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWrr,
MMX_PACKSSWBrr,
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 7b33aed..6966400 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -638,7 +638,8 @@ def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr,
MMX_PALIGNRrri,
MMX_PSIGNBrr,
MMX_PSIGNDrr,
- MMX_PSIGNWrr)>;
+ MMX_PSIGNWrr,
+ MMX_PSUBQrr)>;
def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
let Latency = 2;
@@ -898,7 +899,8 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
let NumMicroOps = 2;
let ReleaseAtCycles = [1,1];
}
-def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm)>;
+def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm,
+ MMX_PSUBQrm)>;
def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
let Latency = 7;
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 4e8e04b..7d5af2e 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -279,6 +279,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
FullFS += ",+evex512";
}
+ // Disable 64-bit-only features in non-64-bit mode.
+ SmallVector<StringRef, 9> FeaturesIn64BitOnly = {
+ "egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"};
+ if (FullFS.find("-64bit-mode") != std::string::npos)
+ llvm::for_each(FeaturesIn64BitOnly,
+ [&](StringRef F) { FullFS += ",-" + F.str(); });
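+ // Feature strings are applied left to right and the last entry wins, so the
+ // trailing "-<feature>" tokens appended here override any earlier
+ // "+<feature>" whenever the string indicates non-64-bit mode.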
+
// Parse features string and set the CPU.
ParseSubtargetFeatures(CPU, TuneCPU, FullFS);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index f710456..02267c1 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3680,6 +3680,8 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } },
{ ISD::SMULO, MVT::v8i64, { 44, 44, 81, 93 } },
{ ISD::SMULO, MVT::v16i32, { 5, 12, 9, 11 } },
+ { ISD::SMULO, MVT::v32i16, { 6, 12, 17, 17 } },
+ { ISD::SMULO, MVT::v64i8, { 22, 28, 42, 42 } },
{ ISD::SSUBSAT, MVT::v2i64, { 2, 13, 9, 10 } },
{ ISD::SSUBSAT, MVT::v4i64, { 2, 15, 7, 8 } },
{ ISD::SSUBSAT, MVT::v8i64, { 2, 14, 7, 8 } },
@@ -3702,6 +3704,8 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } },
{ ISD::UMULO, MVT::v8i64, { 52, 52, 95, 104} },
{ ISD::UMULO, MVT::v16i32, { 5, 12, 8, 10 } },
+ { ISD::UMULO, MVT::v32i16, { 5, 13, 16, 16 } },
+ { ISD::UMULO, MVT::v64i8, { 18, 24, 30, 30 } },
{ ISD::UADDSAT, MVT::v2i64, { 1, 4, 4, 4 } },
{ ISD::UADDSAT, MVT::v4i64, { 1, 4, 4, 4 } },
{ ISD::UADDSAT, MVT::v8i64, { 1, 4, 4, 4 } },
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 0803521..68aed69 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1826,7 +1826,9 @@ const StringMap<bool> sys::getHostCPUFeatures() {
Features["ppx"] = HasAPXF;
Features["ndd"] = HasAPXF;
Features["ccmp"] = HasAPXF;
+ Features["nf"] = HasAPXF;
Features["cf"] = HasAPXF;
+ Features["zu"] = HasAPXF;
bool HasLeafD = MaxLevel >= 0xd &&
!getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index d5a38ec..09ffc2d 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -135,15 +135,12 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
return false;
- ICmpInst::Predicate Pred;
BasicBlock *PhiBB = Phi.getParent();
- if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()),
+ if (!match(TermI, m_Br(m_SpecificICmp(CmpInst::ICMP_EQ, m_Specific(ShAmt),
+ m_ZeroInt()),
m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
return false;
- if (Pred != CmpInst::ICMP_EQ)
- return false;
-
IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
if (ShVal0 == ShVal1)
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 210b79e..6c4f400 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/AutoUpgrade.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalObject.h"
@@ -30,6 +29,7 @@
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/IRMover.h"
+#include "llvm/ProfileData/PGOCtxProfReader.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -185,6 +185,10 @@ static cl::opt<bool> ImportAssumeUniqueLocal(
"user specify the full module path."),
cl::Hidden);
+static cl::opt<std::string>
+ ContextualProfile("thinlto-pgo-ctx-prof",
+ cl::desc("Path to a contextual profile."), cl::Hidden);
+
namespace llvm {
extern cl::opt<bool> EnableMemProfContextDisambiguation;
}
@@ -604,13 +608,7 @@ class WorkloadImportsManager : public ModuleImportsManager {
LLVM_DEBUG(dbgs() << "[Workload] Done\n");
}
-public:
- WorkloadImportsManager(
- function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
- IsPrevailing,
- const ModuleSummaryIndex &Index,
- DenseMap<StringRef, FunctionImporter::ExportSetTy> *ExportLists)
- : ModuleImportsManager(IsPrevailing, Index, ExportLists) {
+ void loadFromJson() {
// Since the workload def uses names, we need a quick lookup
// name->ValueInfo.
StringMap<ValueInfo> NameToValueInfo;
@@ -680,15 +678,81 @@ public:
}
Set.insert(ElemIt->second);
}
- LLVM_DEBUG({
+ }
+ }
+
+ void loadFromCtxProf() {
+ std::error_code EC;
+ auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(ContextualProfile);
+ if (std::error_code EC = BufferOrErr.getError()) {
+ report_fatal_error("Failed to open contextual profile file");
+ return;
+ }
+ auto Buffer = std::move(BufferOrErr.get());
+
+ PGOCtxProfileReader Reader(Buffer->getBuffer());
+ auto Ctx = Reader.loadContexts();
+ if (!Ctx) {
+ report_fatal_error("Failed to parse contextual profiles");
+ return;
+ }
+ const auto &CtxMap = *Ctx;
+ DenseSet<GlobalValue::GUID> ContainedGUIDs;
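+ // Each root context becomes a workload keyed by the module defining the
+ // root; every GUID contained anywhere in that context tree is recorded as a
+ // candidate import for that module.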
+ for (const auto &[RootGuid, Root] : CtxMap) {
+ // Avoid ContainedGUIDs going in and out of scope. Reuse its memory for
+ // subsequent roots, but clear its contents.
+ ContainedGUIDs.clear();
+
+ auto RootVI = Index.getValueInfo(RootGuid);
+ if (!RootVI) {
+ LLVM_DEBUG(dbgs() << "[Workload] Root " << RootGuid
+ << " not found in this linkage unit.\n");
+ continue;
+ }
+ if (RootVI.getSummaryList().size() != 1) {
+ LLVM_DEBUG(dbgs() << "[Workload] Root " << RootGuid
+ << " should have exactly one summary, but has "
+ << RootVI.getSummaryList().size() << ". Skipping.\n");
+ continue;
+ }
+ StringRef RootDefiningModule =
+ RootVI.getSummaryList().front()->modulePath();
+ LLVM_DEBUG(dbgs() << "[Workload] Root defining module for " << RootGuid
+ << " is : " << RootDefiningModule << "\n");
+ auto &Set = Workloads[RootDefiningModule];
+ Root.getContainedGuids(ContainedGUIDs);
+ for (auto Guid : ContainedGUIDs)
+ if (auto VI = Index.getValueInfo(Guid))
+ Set.insert(VI);
+ }
+ }
+
+public:
+ WorkloadImportsManager(
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ IsPrevailing,
+ const ModuleSummaryIndex &Index,
+ DenseMap<StringRef, FunctionImporter::ExportSetTy> *ExportLists)
+ : ModuleImportsManager(IsPrevailing, Index, ExportLists) {
+ if (ContextualProfile.empty() == WorkloadDefinitions.empty()) {
+ report_fatal_error(
+ "Pass only one of: -thinlto-pgo-ctx-prof or -thinlto-workload-def");
+ return;
+ }
+ if (!ContextualProfile.empty())
+ loadFromCtxProf();
+ else
+ loadFromJson();
+ LLVM_DEBUG({
+ for (const auto &[Root, Set] : Workloads) {
dbgs() << "[Workload] Root: " << Root << " we have " << Set.size()
<< " distinct callees.\n";
for (const auto &VI : Set) {
dbgs() << "[Workload] Root: " << Root
<< " Would include: " << VI.getGUID() << "\n";
}
- });
- }
+ }
+ });
}
};
@@ -697,7 +761,7 @@ std::unique_ptr<ModuleImportsManager> ModuleImportsManager::create(
IsPrevailing,
const ModuleSummaryIndex &Index,
DenseMap<StringRef, FunctionImporter::ExportSetTy> *ExportLists) {
- if (WorkloadDefinitions.empty()) {
+ if (WorkloadDefinitions.empty() && ContextualProfile.empty()) {
LLVM_DEBUG(dbgs() << "[Workload] Using the regular imports manager.\n");
return std::unique_ptr<ModuleImportsManager>(
new ModuleImportsManager(IsPrevailing, Index, ExportLists));
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 64da3dfd..c9de9c9 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -2109,7 +2109,7 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc(
Instruction *Call, const Function *Func, const Function *CallerFunc,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
auto *CB = dyn_cast<CallBase>(Call);
- if (!CB->getCalledOperand())
+ if (!CB->getCalledOperand() || CB->isIndirectCall())
return false;
auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0a55f47..3bd0862 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1694,12 +1694,10 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
// Canonicalize signum variant that ends in add:
// (A s>> (BW - 1)) + (zext (A s> 0)) --> (A s>> (BW - 1)) | (zext (A != 0))
- ICmpInst::Predicate Pred;
uint64_t BitWidth = Ty->getScalarSizeInBits();
if (match(LHS, m_AShr(m_Value(A), m_SpecificIntAllowPoison(BitWidth - 1))) &&
- match(RHS, m_OneUse(m_ZExt(
- m_OneUse(m_ICmp(Pred, m_Specific(A), m_ZeroInt()))))) &&
- Pred == CmpInst::ICMP_SGT) {
+ match(RHS, m_OneUse(m_ZExt(m_OneUse(m_SpecificICmp(
+ CmpInst::ICMP_SGT, m_Specific(A), m_ZeroInt())))))) {
Value *NotZero = Builder.CreateIsNotNull(A, "isnotnull");
Value *Zext = Builder.CreateZExt(NotZero, Ty, "isnotnull.zext");
return BinaryOperator::CreateOr(LHS, Zext);
@@ -1711,12 +1709,13 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
// (add X, (sext/zext (icmp eq X, C)))
// -> (select (icmp eq X, C), (add C, (sext/zext 1)), X)
auto CondMatcher = m_CombineAnd(
- m_Value(Cond), m_ICmp(Pred, m_Deferred(A), m_ImmConstant(C)));
+ m_Value(Cond),
+ m_SpecificICmp(ICmpInst::ICMP_EQ, m_Deferred(A), m_ImmConstant(C)));
if (match(&I,
m_c_Add(m_Value(A),
m_CombineAnd(m_Value(Ext), m_ZExtOrSExt(CondMatcher)))) &&
- Pred == ICmpInst::ICMP_EQ && Ext->hasOneUse()) {
+ Ext->hasOneUse()) {
Value *Add = isa<ZExtInst>(Ext) ? InstCombiner::AddOne(C)
: InstCombiner::SubOne(C);
return replaceInstUsesWith(I, Builder.CreateSelect(Cond, Add, A));
@@ -1791,6 +1790,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
// -->
// BW - ctlz(A - 1, false)
const APInt *XorC;
+ ICmpInst::Predicate Pred;
if (match(&I,
m_c_Add(
m_ZExt(m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(A)),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index f9caa4d..4ca12d5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -818,11 +818,11 @@ static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
// Match icmp ult (add %arg, C01), C1 (C1 == C01 << 1; powers of two)
auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X,
APInt &SignBitMask) -> bool {
- CmpInst::Predicate Pred;
const APInt *I01, *I1; // powers of two; I1 == I01 << 1
- if (!(match(ICmp,
- m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) &&
- Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1))
+ if (!(match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT,
+ m_Add(m_Value(X), m_Power2(I01)),
+ m_Power2(I1))) &&
+ I1->ugt(*I01) && I01->shl(1) == *I1))
return false;
// Which bit is the new sign bit as per the 'signed truncation' pattern?
SignBitMask = *I01;
@@ -936,20 +936,21 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd,
std::swap(Cmp0, Cmp1);
// (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1
- CmpInst::Predicate Pred0, Pred1;
Value *X;
- if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
- match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
- m_SpecificInt(2))) &&
- Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) {
+ if (JoinedByAnd &&
+ match(Cmp0, m_SpecificICmp(ICmpInst::ICMP_NE, m_Value(X), m_ZeroInt())) &&
+ match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_ULT,
+ m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+ m_SpecificInt(2)))) {
Value *CtPop = Cmp1->getOperand(0);
return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1));
}
// (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1
- if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
- match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
- m_SpecificInt(1))) &&
- Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) {
+ if (!JoinedByAnd &&
+ match(Cmp0, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(X), m_ZeroInt())) &&
+ match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_UGT,
+ m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+ m_SpecificInt(1)))) {
Value *CtPop = Cmp1->getOperand(0);
return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1));
}
@@ -1608,31 +1609,30 @@ static Instruction *reassociateFCmps(BinaryOperator &BO,
// There are 4 commuted variants of the pattern. Canonicalize operands of this
// logic op so an fcmp is operand 0 and a matching logic op is operand 1.
Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X;
- FCmpInst::Predicate Pred;
- if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP())))
+ if (match(Op1, m_FCmp(m_Value(), m_AnyZeroFP())))
std::swap(Op0, Op1);
// Match inner binop and the predicate for combining 2 NAN checks into 1.
Value *BO10, *BO11;
FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
: FCmpInst::FCMP_UNO;
- if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
+ if (!match(Op0, m_SpecificFCmp(NanPred, m_Value(X), m_AnyZeroFP())) ||
!match(Op1, m_BinOp(Opcode, m_Value(BO10), m_Value(BO11))))
return nullptr;
// The inner logic op must have a matching fcmp operand.
Value *Y;
- if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
- Pred != NanPred || X->getType() != Y->getType())
+ if (!match(BO10, m_SpecificFCmp(NanPred, m_Value(Y), m_AnyZeroFP())) ||
+ X->getType() != Y->getType())
std::swap(BO10, BO11);
- if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
- Pred != NanPred || X->getType() != Y->getType())
+ if (!match(BO10, m_SpecificFCmp(NanPred, m_Value(Y), m_AnyZeroFP())) ||
+ X->getType() != Y->getType())
return nullptr;
// and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z
// or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z
- Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y);
+ Value *NewFCmp = Builder.CreateFCmp(NanPred, X, Y);
if (auto *NewFCmpInst = dyn_cast<FCmpInst>(NewFCmp)) {
// Intersect FMF from the 2 source fcmps.
NewFCmpInst->copyIRFlags(Op0);
@@ -1744,14 +1744,13 @@ Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) {
// -> zext(bitwise(A < 0, icmp))
auto FoldBitwiseICmpZeroWithICmp = [&](Value *Op0,
Value *Op1) -> Instruction * {
- ICmpInst::Predicate Pred;
Value *A;
bool IsMatched =
match(Op0,
m_OneUse(m_LShr(
m_Value(A),
m_SpecificInt(Op0->getType()->getScalarSizeInBits() - 1)))) &&
- match(Op1, m_OneUse(m_ZExt(m_ICmp(Pred, m_Value(), m_Value()))));
+ match(Op1, m_OneUse(m_ZExt(m_ICmp(m_Value(), m_Value()))));
if (!IsMatched)
return nullptr;
@@ -3878,14 +3877,14 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (match(&I,
m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
m_Value(Ov)),
- m_CombineAnd(m_ICmp(Pred,
- m_CombineAnd(m_ExtractValue<0>(
- m_Deferred(UMulWithOv)),
- m_Value(Mul)),
- m_ZeroInt()),
- m_Value(MulIsNotZero)))) &&
- (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) &&
- Pred == CmpInst::ICMP_NE) {
+ m_CombineAnd(
+ m_SpecificICmp(ICmpInst::ICMP_NE,
+ m_CombineAnd(m_ExtractValue<0>(
+ m_Deferred(UMulWithOv)),
+ m_Value(Mul)),
+ m_ZeroInt()),
+ m_Value(MulIsNotZero)))) &&
+ (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse()))) {
Value *A, *B;
if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
m_Value(A), m_Value(B)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f6c4b6e..deb8e3c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -692,12 +692,24 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
Ty);
// Add range attribute since known bits can't completely reflect what we know.
- if (BitWidth != 1 && !II.hasRetAttr(Attribute::Range) &&
- !II.getMetadata(LLVMContext::MD_range)) {
- ConstantRange Range(APInt(BitWidth, Known.countMinPopulation()),
- APInt(BitWidth, Known.countMaxPopulation() + 1));
- II.addRangeRetAttr(Range);
- return &II;
+ if (BitWidth != 1) {
+ ConstantRange OldRange =
+ II.getRange().value_or(ConstantRange::getFull(BitWidth));
+
+ unsigned Lower = Known.countMinPopulation();
+ unsigned Upper = Known.countMaxPopulation() + 1;
+
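+    // A value known to be non-zero has at least one set bit, so the popcount
+    // lower bound can be raised to 1 even when known bits alone cannot prove
+    // it (only useful if the existing range does not already exclude zero).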
+ if (Lower == 0 && OldRange.contains(APInt::getZero(BitWidth)) &&
+ isKnownNonZero(Op0, IC.getSimplifyQuery().getWithInstruction(&II)))
+ Lower = 1;
+
+ ConstantRange Range(APInt(BitWidth, Lower), APInt(BitWidth, Upper));
+ Range = Range.intersectWith(OldRange, ConstantRange::Unsigned);
+
+ if (Range != OldRange) {
+ II.addRangeRetAttr(Range);
+ return &II;
+ }
}
return nullptr;
@@ -1500,10 +1512,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// Don't try to simplify calls without uses. It will not do anything useful,
// but will result in the following folds being skipped.
if (!CI.use_empty()) {
- SmallVector<Value *, 4> Args;
- Args.reserve(CI.arg_size());
- for (Value *Op : CI.args())
- Args.push_back(Op);
+ SmallVector<Value *, 8> Args(CI.args());
if (Value *V = simplifyCall(&CI, CI.getCalledOperand(), Args,
SQ.getWithInstruction(&CI)))
return replaceInstUsesWith(CI, V);
@@ -3031,10 +3040,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// assume( (load addr) != null ) -> add 'nonnull' metadata to load
// (if assume is valid at the load)
- CmpInst::Predicate Pred;
Instruction *LHS;
- if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
- Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
+ if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_NE, m_Instruction(LHS),
+ m_Zero())) &&
+ LHS->getOpcode() == Instruction::Load &&
LHS->getType()->isPointerTy() &&
isValidAssumeForContext(II, LHS, &DT)) {
MDNode *MD = MDNode::get(II->getContext(), std::nullopt);
@@ -3073,8 +3082,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// into
// call void @llvm.assume(i1 true) [ "nonnull"(i32* %PTR) ]
if (EnableKnowledgeRetention &&
- match(IIOperand, m_Cmp(Pred, m_Value(A), m_Zero())) &&
- Pred == CmpInst::ICMP_NE && A->getType()->isPointerTy()) {
+ match(IIOperand,
+ m_SpecificICmp(ICmpInst::ICMP_NE, m_Value(A), m_Zero())) &&
+ A->getType()->isPointerTy()) {
if (auto *Replacement = buildAssumeFromKnowledge(
{RetainedKnowledge{Attribute::NonNull, 0, A}}, Next, &AC, &DT)) {
@@ -3094,9 +3104,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
uint64_t AlignMask;
if (EnableKnowledgeRetention &&
match(IIOperand,
- m_Cmp(Pred, m_And(m_Value(A), m_ConstantInt(AlignMask)),
- m_Zero())) &&
- Pred == CmpInst::ICMP_EQ) {
+ m_SpecificICmp(ICmpInst::ICMP_EQ,
+ m_And(m_Value(A), m_ConstantInt(AlignMask)),
+ m_Zero()))) {
if (isPowerOf2_64(AlignMask + 1)) {
uint64_t Offset = 0;
match(A, m_Add(m_Value(A), m_ConstantInt(Offset)));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 4323635..97ee845 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2666,6 +2666,27 @@ Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
return RetVal;
}
+/// Fold (bitcast (or (and (bitcast X to int), signmask), nneg Y) to fp) to
+/// copysign((bitcast Y to fp), X)
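+/// Since Y is known non-negative its sign bit is clear, so OR-ing in the sign
+/// bit of X yields a value with Y's magnitude and X's sign, i.e. a copysign.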
+static Value *foldCopySignIdioms(BitCastInst &CI,
+ InstCombiner::BuilderTy &Builder,
+ const SimplifyQuery &SQ) {
+ Value *X, *Y;
+ Type *FTy = CI.getType();
+ if (!FTy->isFPOrFPVectorTy())
+ return nullptr;
+ if (!match(&CI, m_ElementWiseBitCast(m_c_Or(
+ m_And(m_ElementWiseBitCast(m_Value(X)), m_SignMask()),
+ m_Value(Y)))))
+ return nullptr;
+ if (X->getType() != FTy)
+ return nullptr;
+ if (!isKnownNonNegative(Y, SQ))
+ return nullptr;
+
+ return Builder.CreateCopySign(Builder.CreateBitCast(Y, FTy), X);
+}
+
Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
// If the operands are integer typed then apply the integer transforms,
// otherwise just apply the common ones.
@@ -2807,6 +2828,9 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
if (Instruction *I = foldBitCastSelect(CI, Builder))
return I;
+ if (Value *V = foldCopySignIdioms(CI, Builder, SQ.getWithInstruction(&CI)))
+ return replaceInstUsesWith(CI, V);
+
return commonCastTransforms(CI);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index abadf54..3b6df27 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5772,8 +5772,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
// -> icmp eq/ne X, rotate-left(X)
// We generally try to convert rotate-right -> rotate-left, this just
// canonicalizes another case.
- CmpInst::Predicate PredUnused = Pred;
- if (match(&I, m_c_ICmp(PredUnused, m_Value(A),
+ if (match(&I, m_c_ICmp(m_Value(A),
m_OneUse(m_Intrinsic<Intrinsic::fshr>(
m_Deferred(A), m_Deferred(A), m_Value(B))))))
return new ICmpInst(
@@ -5783,8 +5782,7 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
// Canonicalize:
// icmp eq/ne OneUse(A ^ Cst), B --> icmp eq/ne (A ^ B), Cst
Constant *Cst;
- if (match(&I, m_c_ICmp(PredUnused,
- m_OneUse(m_Xor(m_Value(A), m_ImmConstant(Cst))),
+ if (match(&I, m_c_ICmp(m_OneUse(m_Xor(m_Value(A), m_ImmConstant(Cst))),
m_CombineAnd(m_Value(B), m_Unless(m_ImmConstant())))))
return new ICmpInst(Pred, Builder.CreateXor(A, B), Cst);
@@ -5795,13 +5793,12 @@ Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
m_c_Xor(m_Value(B), m_Deferred(A))),
m_Sub(m_Value(B), m_Deferred(A)));
std::optional<bool> IsZero = std::nullopt;
- if (match(&I, m_c_ICmp(PredUnused, m_OneUse(m_c_And(m_Value(A), m_Matcher)),
+ if (match(&I, m_c_ICmp(m_OneUse(m_c_And(m_Value(A), m_Matcher)),
m_Deferred(A))))
IsZero = false;
// (icmp eq/ne (and (add/sub/xor X, P2), P2), 0)
else if (match(&I,
- m_ICmp(PredUnused, m_OneUse(m_c_And(m_Value(A), m_Matcher)),
- m_Zero())))
+ m_ICmp(m_OneUse(m_c_And(m_Value(A), m_Matcher)), m_Zero())))
IsZero = true;
if (IsZero && isKnownToBeAPowerOfTwo(A, /* OrZero */ true, /*Depth*/ 0, &I))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index aaf4ece..a22ee1d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2484,9 +2484,8 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
// Finally, see if the select is filtering out a shift-by-zero.
Value *Cond = Sel.getCondition();
- ICmpInst::Predicate Pred;
- if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
- Pred != ICmpInst::ICMP_EQ)
+ if (!match(Cond, m_OneUse(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(ShAmt),
+ m_ZeroInt()))))
return nullptr;
// If this is not a rotate then the select was blocking poison from the
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8a6ec30..c494fec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -388,8 +388,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
// invert the transform that reduces set bits and infinite-loop.
Value *X;
const APInt *CmpC;
- ICmpInst::Predicate Pred;
- if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) ||
+ if (!match(I->getOperand(0), m_ICmp(m_Value(X), m_APInt(CmpC))) ||
isa<Constant>(X) || CmpC->getBitWidth() != SelC->getBitWidth())
return ShrinkDemandedConstant(I, OpNo, DemandedMask);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 0d8e7e9..0fb8b63 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1651,14 +1651,14 @@ static Constant *constantFoldOperationIntoSelectOperand(Instruction &I,
bool IsTrueArm) {
SmallVector<Constant *> ConstOps;
for (Value *Op : I.operands()) {
- CmpInst::Predicate Pred;
Constant *C = nullptr;
if (Op == SI) {
C = dyn_cast<Constant>(IsTrueArm ? SI->getTrueValue()
: SI->getFalseValue());
} else if (match(SI->getCondition(),
- m_ICmp(Pred, m_Specific(Op), m_Constant(C))) &&
- Pred == (IsTrueArm ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE) &&
+ m_SpecificICmp(IsTrueArm ? ICmpInst::ICMP_EQ
+ : ICmpInst::ICMP_NE,
+ m_Specific(Op), m_Constant(C))) &&
isGuaranteedNotToBeUndefOrPoison(C)) {
// Pass
} else {
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index a0e63bf1..812874f 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1528,11 +1528,7 @@ static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE,
bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
Function &F, FunctionAnalysisManager &FAM) const {
- bool Skip = [&]() {
- if (ClRandomSkipRate.getNumOccurrences()) {
- std::bernoulli_distribution D(ClRandomSkipRate);
- return !D(*Rng);
- }
+ auto SkipHot = [&]() {
if (!ClHotPercentileCutoff.getNumOccurrences())
return false;
auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
@@ -1544,7 +1540,16 @@ bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
}
return PSI->isFunctionHotInCallGraphNthPercentile(
ClHotPercentileCutoff, &F, FAM.getResult<BlockFrequencyAnalysis>(F));
- }();
+ };
+
+ auto SkipRandom = [&]() {
+ if (!ClRandomSkipRate.getNumOccurrences())
+ return false;
+ std::bernoulli_distribution D(ClRandomSkipRate);
+ return !D(*Rng);
+ };
+
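+  // Instrumentation is skipped if either the random-skip lottery fires or the
+  // function is hot according to the percentile cutoff.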
+ bool Skip = SkipRandom() || SkipHot();
emitRemark(F, FAM.getResult<OptimizationRemarkEmitterAnalysis>(F), Skip);
return Skip;
}
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index d139607..1805ea8 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1192,18 +1192,22 @@ CallInst *InstrLowerer::getRMWOrCall(Value *Addr, Value *Val) {
Value *InstrLowerer::getBitmapAddress(InstrProfMCDCTVBitmapUpdate *I) {
auto *Bitmaps = getOrCreateRegionBitmaps(I);
- IRBuilder<> Builder(I);
-
- if (isRuntimeCounterRelocationEnabled()) {
- LLVMContext &Ctx = M.getContext();
- Ctx.diagnose(DiagnosticInfoPGOProfile(
- M.getName().data(),
- Twine("Runtime counter relocation is presently not supported for MC/DC "
- "bitmaps."),
- DS_Warning));
- }
+ if (!isRuntimeCounterRelocationEnabled())
+ return Bitmaps;
- return Bitmaps;
+ // Put BiasLI onto the entry block.
+ Type *Int64Ty = Type::getInt64Ty(M.getContext());
+ Function *Fn = I->getFunction();
+ IRBuilder<> EntryBuilder(&Fn->getEntryBlock().front());
+ auto *Bias = getOrCreateBiasVar(getInstrProfBitmapBiasVarName());
+ auto *BiasLI = EntryBuilder.CreateLoad(Int64Ty, Bias, "profbm_bias");
+ // Assume BiasLI is invariant (at least within this function).
+ BiasLI->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(M.getContext(), std::nullopt));
+
+ // Add Bias to Bitmaps and put it before the intrinsic.
+ IRBuilder<> Builder(I);
+ return Builder.CreatePtrAdd(Bitmaps, BiasLI, "profbm_addr");
}
void InstrLowerer::lowerCover(InstrProfCoverInst *CoverInstruction) {
diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
index 0115809..19cf7dc 100644
--- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
+++ b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp
@@ -76,13 +76,25 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
SmallVector<std::pair<IntrinsicInst *, bool>, 16> ReplaceWithValue;
std::unique_ptr<RandomNumberGenerator> Rng;
- auto ShouldRemove = [&](bool IsHot) {
- if (!RandomRate.getNumOccurrences())
- return IsHot;
+ auto GetRng = [&]() -> RandomNumberGenerator & {
if (!Rng)
Rng = F.getParent()->createRNG(F.getName());
- std::bernoulli_distribution D(RandomRate);
- return !D(*Rng);
+ return *Rng;
+ };
+
+ auto ShouldRemoveHot = [&](const BasicBlock &BB) {
+ return HotPercentileCutoff.getNumOccurrences() && PSI &&
+ PSI->isHotCountNthPercentile(
+ HotPercentileCutoff, BFI.getBlockProfileCount(&BB).value_or(0));
+ };
+
+ auto ShouldRemoveRandom = [&]() {
+ return RandomRate.getNumOccurrences() &&
+ !std::bernoulli_distribution(RandomRate)(GetRng());
+ };
+
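+  // A check is removed when either the random-removal rate fires or the
+  // containing block is hot per PSI.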
+ auto ShouldRemove = [&](const BasicBlock &BB) {
+ return ShouldRemoveRandom() || ShouldRemoveHot(BB);
};
for (BasicBlock &BB : F) {
@@ -96,13 +108,7 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI,
case Intrinsic::allow_runtime_check: {
++NumChecksTotal;
- bool IsHot = false;
- if (PSI) {
- uint64_t Count = BFI.getBlockProfileCount(&BB).value_or(0);
- IsHot = PSI->isHotCountNthPercentile(HotPercentileCutoff, Count);
- }
-
- bool ToRemove = ShouldRemove(IsHot);
+ bool ToRemove = ShouldRemove(BB);
ReplaceWithValue.push_back({
II,
ToRemove,
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 2c5d749..445bf0b 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -61,6 +61,9 @@ constexpr int LLVM_MEM_PROFILER_VERSION = 1;
// Size of memory mapped to a single shadow location.
constexpr uint64_t DefaultMemGranularity = 64;
+// Size of memory mapped to a single histogram bucket.
+constexpr uint64_t HistogramGranularity = 8;
+
// Scale from granularity down to shadow size.
constexpr uint64_t DefaultShadowScale = 3;
@@ -192,7 +195,7 @@ namespace {
struct ShadowMapping {
ShadowMapping() {
Scale = ClMappingScale;
- Granularity = ClMappingGranularity;
+ Granularity = ClHistogram ? HistogramGranularity : ClMappingGranularity;
Mask = ~(Granularity - 1);
}
@@ -276,6 +279,8 @@ MemProfilerPass::MemProfilerPass() = default;
PreservedAnalyses MemProfilerPass::run(Function &F,
AnalysisManager<Function> &AM) {
+ assert((!ClHistogram || ClMappingGranularity == DefaultMemGranularity) &&
+ "Memprof with histogram only supports default mapping granularity");
Module &M = *F.getParent();
MemProfiler Profiler(M);
if (Profiler.instrumentFunction(F))
@@ -288,10 +293,6 @@ ModuleMemProfilerPass::ModuleMemProfilerPass() = default;
PreservedAnalyses ModuleMemProfilerPass::run(Module &M,
AnalysisManager<Module> &AM) {
- assert((!ClHistogram || (ClHistogram && ClUseCalls)) &&
- "Cannot use -memprof-histogram without Callbacks. Set "
- "memprof-use-callbacks");
-
ModuleMemProfiler Profiler(M);
if (Profiler.instrumentModule(M))
return PreservedAnalyses::none();
@@ -489,14 +490,21 @@ void MemProfiler::instrumentAddress(Instruction *OrigIns,
return;
}
- // Create an inline sequence to compute shadow location, and increment the
- // value by one.
- Type *ShadowTy = Type::getInt64Ty(*C);
+ Type *ShadowTy = ClHistogram ? Type::getInt8Ty(*C) : Type::getInt64Ty(*C);
Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+
Value *ShadowPtr = memToShadow(AddrLong, IRB);
Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy);
Value *ShadowValue = IRB.CreateLoad(ShadowTy, ShadowAddr);
- Value *Inc = ConstantInt::get(Type::getInt64Ty(*C), 1);
+ // If we are profiling with histograms, add overflow protection at 255.
+ if (ClHistogram) {
+ Value *MaxCount = ConstantInt::get(Type::getInt8Ty(*C), 255);
+ Value *Cmp = IRB.CreateICmpULT(ShadowValue, MaxCount);
+ Instruction *IncBlock =
+ SplitBlockAndInsertIfThen(Cmp, InsertBefore, /*Unreachable=*/false);
+ IRB.SetInsertPoint(IncBlock);
+ }
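+  // The increment below is now guarded by the ULT-255 check, so the 8-bit
+  // histogram counters saturate at 255 instead of wrapping back to zero.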
+ Value *Inc = ConstantInt::get(ShadowTy, 1);
ShadowValue = IRB.CreateAdd(ShadowValue, Inc);
IRB.CreateStore(ShadowValue, ShadowAddr);
}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 910c36f..45b3edf2 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3873,11 +3873,18 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- /// Handle Arm NEON vector store intrinsics (vst{2,3,4}).
+ /// Handle Arm NEON vector store intrinsics (vst{2,3,4} and vst1x_{2,3,4}).
///
/// Arm NEON vector store intrinsics have the output address (pointer) as the
/// last argument, with the initial arguments being the inputs. They return
/// void.
+ ///
+ /// - st4 interleaves the output e.g., st4 (inA, inB, inC, inD, outP) writes
+ /// abcdabcdabcdabcd... into *outP
+ /// - st1_x4 is non-interleaved e.g., st1_x4 (inA, inB, inC, inD, outP)
+ /// writes aaaa...bbbb...cccc...dddd... into *outP
+ /// These instructions can all be instrumented with essentially the same
+ /// MSan logic, simply by applying the corresponding intrinsic to the shadow.
void handleNEONVectorStoreIntrinsic(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
@@ -3892,11 +3899,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
+ SmallVector<Value *, 8> Shadows;
// Every arg operand, other than the last one, is an input vector
- IntrinsicInst *ShadowI = cast<IntrinsicInst>(I.clone());
for (int i = 0; i < numArgOperands - 1; i++) {
assert(isa<FixedVectorType>(I.getArgOperand(i)->getType()));
- ShadowI->setArgOperand(i, getShadow(&I, i));
+ Value *Shadow = getShadow(&I, i);
+ Shadows.append(1, Shadow);
}
// MSan's GetShadowTy assumes the LHS is the type we want the shadow for
@@ -3914,13 +3922,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
cast<FixedVectorType>(I.getArgOperand(0)->getType())->getElementType(),
cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements() *
(numArgOperands - 1));
- Type *ShadowTy = getShadowTy(OutputVectorTy);
- Value *ShadowPtr, *OriginPtr;
+ Type *OutputShadowTy = getShadowTy(OutputVectorTy);
+
+ Value *OutputShadowPtr, *OutputOriginPtr;
// AArch64 NEON does not need alignment (unless OS requires it)
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Align(1), /*isStore*/ true);
- ShadowI->setArgOperand(numArgOperands - 1, ShadowPtr);
- ShadowI->insertAfter(&I);
+ std::tie(OutputShadowPtr, OutputOriginPtr) = getShadowOriginPtr(
+ Addr, IRB, OutputShadowTy, Align(1), /*isStore*/ true);
+ Shadows.append(1, OutputShadowPtr);
+
+ // CreateIntrinsic will select the correct (integer) type for the
+ // intrinsic; the original instruction I may have either integer- or
+ // float-type inputs.
+ CallInst *CI =
+ IRB.CreateIntrinsic(IRB.getVoidTy(), I.getIntrinsicID(), Shadows);
+ setShadow(&I, CI);
if (MS.TrackOrigins) {
// TODO: if we modelled the vst* instruction more precisely, we could
@@ -3932,7 +3947,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
OC.Add(I.getArgOperand(i));
const DataLayout &DL = F.getDataLayout();
- OC.DoneAndStoreOrigin(DL.getTypeStoreSize(OutputVectorTy), OriginPtr);
+ OC.DoneAndStoreOrigin(DL.getTypeStoreSize(OutputVectorTy),
+ OutputOriginPtr);
}
}
@@ -4277,6 +4293,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getCleanOrigin());
break;
+ case Intrinsic::aarch64_neon_st1x2:
+ case Intrinsic::aarch64_neon_st1x3:
+ case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4: {
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index 832506f..5872396 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -1715,7 +1715,7 @@ Value *NumericalStabilitySanitizer::createShadowValueWithOperandsAvailable(
Map.getShadow(BinOp->getOperand(1)));
if (isa<UIToFPInst>(&Inst) || isa<SIToFPInst>(&Inst)) {
- auto *Cast = dyn_cast<CastInst>(&Inst);
+ auto *Cast = cast<CastInst>(&Inst);
return Builder.CreateCast(Cast->getOpcode(), Cast->getOperand(0),
ExtendedVT);
}
@@ -2168,7 +2168,7 @@ bool NumericalStabilitySanitizer::sanitizeFunction(
// The last pass populates shadow phis with shadow values.
for (PHINode *Phi : OriginalPhis) {
- PHINode *ShadowPhi = dyn_cast<PHINode>(ValueToShadow.getShadow(Phi));
+ PHINode *ShadowPhi = cast<PHINode>(ValueToShadow.getShadow(Phi));
for (unsigned I : seq(Phi->getNumOperands())) {
Value *V = Phi->getOperand(I);
Value *Shadow = ValueToShadow.getShadow(V);
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 4371b82..d2268f0 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -149,9 +149,7 @@ private:
unfoldSelectInstrs(DominatorTree *DT,
const SmallVector<SelectInstToUnfold, 4> &SelectInsts) {
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- SmallVector<SelectInstToUnfold, 4> Stack;
- for (SelectInstToUnfold SIToUnfold : SelectInsts)
- Stack.push_back(SIToUnfold);
+ SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts);
while (!Stack.empty()) {
SelectInstToUnfold SIToUnfold = Stack.pop_back_val();
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 931606c..992139a 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1889,12 +1889,12 @@ struct DSEState {
return true;
auto *Ptr = Memset->getArgOperand(0);
auto *TI = MallocBB->getTerminator();
- ICmpInst::Predicate Pred;
BasicBlock *TrueBB, *FalseBB;
- if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
- FalseBB)))
+ if (!match(TI, m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(Ptr),
+ m_Zero()),
+ TrueBB, FalseBB)))
return false;
- if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
+ if (MemsetBB != FalseBB)
return false;
return true;
};
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 6092cd1b..ff07762 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -160,9 +160,8 @@ static bool isProcessableCondBI(const ScalarEvolution &SE,
const BranchInst *BI) {
BasicBlock *TrueSucc = nullptr;
BasicBlock *FalseSucc = nullptr;
- ICmpInst::Predicate Pred;
Value *LHS, *RHS;
- if (!match(BI, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
+ if (!match(BI, m_Br(m_ICmp(m_Value(LHS), m_Value(RHS)),
m_BasicBlock(TrueSucc), m_BasicBlock(FalseSucc))))
return false;
diff --git a/llvm/lib/Transforms/Scalar/LoopSink.cpp b/llvm/lib/Transforms/Scalar/LoopSink.cpp
index 6eedf95..5c6ed84 100644
--- a/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -144,7 +144,23 @@ findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
BBsToSinkInto.erase(DominatedBB);
}
BBsToSinkInto.insert(ColdestBB);
+ continue;
}
+ // Otherwise, see if we can stop the search through the cold BBs early.
+ // Since the ColdLoopBBs list is sorted in increasing magnitude of
+ // frequency, the cold BB frequencies can only get larger. The
+ // BBsToSinkInto set can only get smaller and have a smaller
+ // adjustedSumFreq, due to the earlier checking. So once we find a cold BB
+ // with a frequency at least as large as the adjustedSumFreq of the
+ // current BBsToSinkInto set, the earlier frequency check can never be
+ // true for a future iteration. Note we could check this more
+ // aggressively earlier, but in practice this ended up being more
+ // expensive overall (added checking to the critical path through the loop
+ // that often ended up continuing early due to an empty
+ // BBsDominatedByColdestBB set, and the frequency check there was false
+ // most of the time anyway).
+ if (adjustedSumFreq(BBsToSinkInto, BFI) <= BFI.getBlockFreq(ColdestBB))
+ break;
}
// Can't sink into blocks that have no valid insertion point.
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 0c45bd8..30a343b 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -48,6 +48,7 @@ STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns");
STATISTIC(NumReturnedArg, "Number of arguments inferred as returned");
STATISTIC(NumWillReturn, "Number of functions inferred as willreturn");
+STATISTIC(NumCold, "Number of functions inferred as cold");
static bool setDoesNotAccessMemory(Function &F) {
if (F.doesNotAccessMemory())
@@ -57,6 +58,14 @@ static bool setDoesNotAccessMemory(Function &F) {
return true;
}
+static bool setIsCold(Function &F) {
+ if (F.hasFnAttribute(Attribute::Cold))
+ return false;
+ F.addFnAttr(Attribute::Cold);
+ ++NumCold;
+ return true;
+}
+
static bool setOnlyAccessesInaccessibleMemory(Function &F) {
if (F.onlyAccessesInaccessibleMemory())
return false;
@@ -270,6 +279,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setNonLazyBind(F);
switch (TheLibFunc) {
+ case LibFunc_nan:
+ case LibFunc_nanf:
+ case LibFunc_nanl:
case LibFunc_strlen:
case LibFunc_strnlen:
case LibFunc_wcslen:
@@ -1087,6 +1099,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotThrow(F);
break;
+ case LibFunc_abort:
+ Changed |= setIsCold(F);
+ break;
// int __nvvm_reflect(const char *)
case LibFunc_nvvm_reflect:
Changed |= setRetAndArgsNoUndef(F);
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index ab1edf4..f92e921 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -71,24 +71,26 @@ static bool isExitBlock(BasicBlock *BB,
return is_contained(ExitBlocks, BB);
}
+// Cache the Loop ExitBlocks computed during the analysis. We expect to get a
+// lot of instructions within the same loops, computing the exit blocks is
+// expensive, and we're not mutating the loop structure.
+using LoopExitBlocksTy = SmallDenseMap<Loop *, SmallVector<BasicBlock *, 1>>;
+
/// For every instruction from the worklist, check to see if it has any uses
/// that are outside the current loop. If so, insert LCSSA PHI nodes and
/// rewrite the uses.
-bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
- const DominatorTree &DT, const LoopInfo &LI,
- ScalarEvolution *SE,
- SmallVectorImpl<PHINode *> *PHIsToRemove,
- SmallVectorImpl<PHINode *> *InsertedPHIs) {
+static bool
+formLCSSAForInstructionsImpl(SmallVectorImpl<Instruction *> &Worklist,
+ const DominatorTree &DT, const LoopInfo &LI,
+ ScalarEvolution *SE,
+ SmallVectorImpl<PHINode *> *PHIsToRemove,
+ SmallVectorImpl<PHINode *> *InsertedPHIs,
+ LoopExitBlocksTy &LoopExitBlocks) {
SmallVector<Use *, 16> UsesToRewrite;
SmallSetVector<PHINode *, 16> LocalPHIsToRemove;
PredIteratorCache PredCache;
bool Changed = false;
- // Cache the Loop ExitBlocks across this loop. We expect to get a lot of
- // instructions within the same loops, computing the exit blocks is
- // expensive, and we're not mutating the loop structure.
- SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
-
while (!Worklist.empty()) {
UsesToRewrite.clear();
@@ -317,13 +319,28 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
return Changed;
}
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop. If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
+ const DominatorTree &DT, const LoopInfo &LI,
+ ScalarEvolution *SE,
+ SmallVectorImpl<PHINode *> *PHIsToRemove,
+ SmallVectorImpl<PHINode *> *InsertedPHIs) {
+ LoopExitBlocksTy LoopExitBlocks;
+
+ return formLCSSAForInstructionsImpl(Worklist, DT, LI, SE, PHIsToRemove,
+ InsertedPHIs, LoopExitBlocks);
+}
+
// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
static void computeBlocksDominatingExits(
- Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+ Loop &L, const DominatorTree &DT,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks,
SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
// We start from the exit blocks, as every block trivially dominates itself
// (not strictly).
- SmallVector<BasicBlock *, 8> BBWorklist(ExitBlocks);
+ SmallVector<BasicBlock *, 8> BBWorklist(ExitBlocks.begin(), ExitBlocks.end());
while (!BBWorklist.empty()) {
BasicBlock *BB = BBWorklist.pop_back_val();
@@ -360,8 +377,9 @@ static void computeBlocksDominatingExits(
}
}
-bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
- ScalarEvolution *SE) {
+static bool formLCSSAImpl(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+ ScalarEvolution *SE,
+ LoopExitBlocksTy &LoopExitBlocks) {
bool Changed = false;
#ifdef EXPENSIVE_CHECKS
@@ -372,8 +390,9 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
}
#endif
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
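+  // Compute and cache this loop's exit blocks on first use.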
+ if (!LoopExitBlocks.count(&L))
+ L.getExitBlocks(LoopExitBlocks[&L]);
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[&L];
if (ExitBlocks.empty())
return false;
@@ -414,26 +433,43 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
}
}
- Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE);
+ Changed = formLCSSAForInstructionsImpl(Worklist, DT, *LI, SE, nullptr,
+ nullptr, LoopExitBlocks);
assert(L.isLCSSAForm(DT));
return Changed;
}
+bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+ ScalarEvolution *SE) {
+ LoopExitBlocksTy LoopExitBlocks;
+
+ return formLCSSAImpl(L, DT, LI, SE, LoopExitBlocks);
+}
+
/// Process a loop nest depth first.
-bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
- const LoopInfo *LI, ScalarEvolution *SE) {
+static bool formLCSSARecursivelyImpl(Loop &L, const DominatorTree &DT,
+ const LoopInfo *LI, ScalarEvolution *SE,
+ LoopExitBlocksTy &LoopExitBlocks) {
bool Changed = false;
// Recurse depth-first through inner loops.
for (Loop *SubLoop : L.getSubLoops())
- Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
+ Changed |= formLCSSARecursivelyImpl(*SubLoop, DT, LI, SE, LoopExitBlocks);
- Changed |= formLCSSA(L, DT, LI, SE);
+ Changed |= formLCSSAImpl(L, DT, LI, SE, LoopExitBlocks);
return Changed;
}
+/// Process a loop nest depth first.
+bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
+ const LoopInfo *LI, ScalarEvolution *SE) {
+ LoopExitBlocksTy LoopExitBlocks;
+
+ return formLCSSARecursivelyImpl(L, DT, LI, SE, LoopExitBlocks);
+}
+
/// Process all loops in the function, inner-most out.
static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
ScalarEvolution *SE) {
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index ee0d95b..f0c7e31 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -45,8 +45,7 @@ void llvm::createMemCpyLoopKnownSize(
Type *TypeOfCopyLen = CopyLen->getType();
Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(),
- AtomicElementSize);
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
"Atomic memcpy lowering is not supported for vector operand type");
@@ -111,8 +110,8 @@ void llvm::createMemCpyLoopKnownSize(
SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
- SrcAS, DstAS, SrcAlign.value(),
- DstAlign.value(), AtomicElementSize);
+ SrcAS, DstAS, SrcAlign, DstAlign,
+ AtomicElementSize);
for (auto *OpTy : RemainingOps) {
Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
@@ -197,8 +196,7 @@ void llvm::createMemCpyLoopUnknownSize(
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value(),
- AtomicElementSize);
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DstAlign, AtomicElementSize);
assert((!AtomicElementSize || !LoopOpType->isVectorTy()) &&
"Atomic memcpy lowering is not supported for vector operand type");
unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
@@ -411,8 +409,8 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
- Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
+ SrcAlign, DstAlign);
unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
Type *Int8Type = Type::getInt8Ty(Ctx);
bool LoopOpIsInt8 = LoopOpType == Int8Type;
@@ -668,8 +666,8 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
- Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
+ SrcAlign, DstAlign);
unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
// Calculate the loop trip count and remaining bytes to copy after the loop.
@@ -737,8 +735,8 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
IRBuilder<> BwdResBuilder(CopyBackwardsBB->getFirstNonPHI());
SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
- SrcAS, DstAS, PartSrcAlign.value(),
- PartDstAlign.value());
+ SrcAS, DstAS, PartSrcAlign,
+ PartDstAlign);
for (auto *OpTy : RemainingOps) {
// reverse the order of the emitted operations
BwdResBuilder.SetInsertPoint(CopyBackwardsBB->getFirstNonPHI());
@@ -818,8 +816,8 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
IRBuilder<> FwdResBuilder(FwdResidualBB->getTerminator());
SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
- SrcAS, DstAS, PartSrcAlign.value(),
- PartDstAlign.value());
+ SrcAS, DstAS, PartSrcAlign,
+ PartDstAlign);
for (auto *OpTy : RemainingOps)
GenerateResidualLdStPair(OpTy, FwdResBuilder, BytesCopied);
}
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 2336466..c944859 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -1503,7 +1503,7 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
Value *V2 = SCCPSolver::isConstant(V2State)
? getConstant(V2State, I.getOperand(1)->getType())
: I.getOperand(1);
- Value *R = simplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL));
+ Value *R = simplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL, &I));
auto *C = dyn_cast_or_null<Constant>(R);
if (C) {
// Conservatively assume that the result may be based on operands that may
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 92c4426..4100471 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -3729,6 +3729,17 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
return nullptr;
}
+Value *LibCallSimplifier::optimizeExit(CallInst *CI) {
+
+  // Mark 'exit' as cold if it's not exit(0) (success).
+ const APInt *C;
+ if (!CI->hasFnAttr(Attribute::Cold) &&
+ match(CI->getArgOperand(0), m_APInt(C)) && !C->isZero()) {
+ CI->addFnAttr(Attribute::Cold);
+ }
+ return nullptr;
+}
+
Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
// bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
return copyFlags(*CI, B.CreateMemMove(CI->getArgOperand(1), Align(1),
@@ -4084,6 +4095,9 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
case LibFunc_vfprintf:
case LibFunc_fiprintf:
return optimizeErrorReporting(CI, Builder, 0);
+ case LibFunc_exit:
+ case LibFunc_Exit:
+ return optimizeExit(CI);
default:
return nullptr;
}
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 1696e9c..5af56b0 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -565,9 +565,8 @@ void Mapper::remapDbgRecord(DbgRecord &DR) {
}
// Find Value operands and remap those.
- SmallVector<Value *, 4> Vals, NewVals;
- for (Value *Val : V.location_ops())
- Vals.push_back(Val);
+ SmallVector<Value *, 4> Vals(V.location_ops());
+ SmallVector<Value *, 4> NewVals;
for (Value *Val : Vals)
NewVals.push_back(mapValue(Val));
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 64e04ca..cb31e2a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -268,25 +268,25 @@ bool LoopIdiomVectorize::recognizeByteCompare() {
return false;
// Match the branch instruction for the header
- ICmpInst::Predicate Pred;
Value *MaxLen;
BasicBlock *EndBB, *WhileBB;
if (!match(Header->getTerminator(),
- m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)),
+ m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(Index),
+ m_Value(MaxLen)),
m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) ||
- Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB))
+ !CurLoop->contains(WhileBB))
return false;
// WhileBB should contain the pattern of load & compare instructions. Match
// the pattern and find the GEP instructions used by the loads.
- ICmpInst::Predicate WhilePred;
BasicBlock *FoundBB;
BasicBlock *TrueBB;
Value *LoadA, *LoadB;
if (!match(WhileBB->getTerminator(),
- m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)),
+ m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(LoadA),
+ m_Value(LoadB)),
m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) ||
- WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB))
+ !CurLoop->contains(TrueBB))
return false;
Value *A, *B;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 590f315..ca20bc3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -411,6 +411,9 @@ public:
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC);
+ /// Emit remarks for recipes with invalid costs in the available VPlans.
+ void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
+
protected:
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9733ac0..6daa804 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -75,6 +75,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -889,20 +890,18 @@ static void debugVectorizationMessage(const StringRef Prefix,
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
- StringRef RemarkName, Loop *TheLoop, Instruction *I) {
- Value *CodeRegion = TheLoop->getHeader();
- DebugLoc DL = TheLoop->getStartLoc();
-
- if (I) {
- CodeRegion = I->getParent();
- // If there is no debug location attached to the instruction, revert back to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- }
+/// the location of the remark. If \p DL is passed, use it as debug location for
+/// the remark. \return the remark object that can be streamed to.
+static OptimizationRemarkAnalysis
+createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
+ Instruction *I, DebugLoc DL = {}) {
+ Value *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
+  // If a debug location is attached to the instruction, use it. Otherwise, if
+  // DL was not provided, use the loop's.
+ if (I && I->getDebugLoc())
+ DL = I->getDebugLoc();
+ else if (!DL)
+ DL = TheLoop->getStartLoc();
return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
@@ -943,15 +942,17 @@ void reportVectorizationFailure(const StringRef DebugMsg,
/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
-/// otherwise \p TheLoop.
+/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for
+/// the remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop,
- Instruction *I = nullptr) {
+ OptimizationRemarkEmitter *ORE,
+ Loop *TheLoop, Instruction *I = nullptr,
+ DebugLoc DL = {}) {
LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
- ORE->emit(
- createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
- << Msg);
+ ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop,
+ I, DL)
+ << Msg);
}
/// Report successful vectorization of the loop. In case an outer loop is
@@ -1538,12 +1539,8 @@ public:
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
- /// the factor width. If \p Invalid is not nullptr, this function
- /// will add a pair(Instruction*, ElementCount) to \p Invalid for
- /// each instruction that has an Invalid cost for the given VF.
- InstructionCost
- expectedCost(ElementCount VF,
- SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
+ /// the factor width.
+ InstructionCost expectedCost(ElementCount VF);
bool hasPredStores() const { return NumPredStores > 0; }
@@ -4350,24 +4347,38 @@ bool LoopVectorizationPlanner::isMoreProfitable(
return CmpFn(RTCostA, RTCostB);
}
-static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
- OptimizationRemarkEmitter *ORE,
- Loop *TheLoop) {
+void LoopVectorizationPlanner::emitInvalidCostRemarks(
+ OptimizationRemarkEmitter *ORE) {
+ using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
+ LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
+ SmallVector<RecipeVFPair> InvalidCosts;
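+  // Collect recipe-VF pairs with invalid costs across all candidate VPlans.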
+ for (const auto &Plan : VPlans) {
+ for (ElementCount VF : Plan->vectorFactors()) {
+ VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx,
+ CM);
+ auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+ for (auto &R : *VPBB) {
+ if (!R.cost(VF, CostCtx).isValid())
+ InvalidCosts.emplace_back(&R, VF);
+ }
+ }
+ }
+ }
if (InvalidCosts.empty())
return;
// Emit a report of VFs with invalid costs in the loop.
- // Group the remarks per instruction, keeping the instruction order from
- // InvalidCosts.
- std::map<Instruction *, unsigned> Numbering;
+ // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
+ DenseMap<VPRecipeBase *, unsigned> Numbering;
unsigned I = 0;
for (auto &Pair : InvalidCosts)
if (!Numbering.count(Pair.first))
Numbering[Pair.first] = I++;
- // Sort the list, first on instruction(number) then on VF.
- sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
+ // Sort the list, first on recipe(number) then on VF.
+ sort(InvalidCosts, [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
if (Numbering[A.first] != Numbering[B.first])
return Numbering[A.first] < Numbering[B.first];
const auto &LHS = A.second;
@@ -4376,38 +4387,64 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
});
- // For a list of ordered instruction-vf pairs:
- // [(load, vf1), (load, vf2), (store, vf1)]
- // Group the instructions together to emit separate remarks for:
- // load (vf1, vf2)
- // store (vf1)
- auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
- auto Subset = ArrayRef<InstructionVFPair>();
+ // For a list of ordered recipe-VF pairs:
+ // [(load, VF1), (load, VF2), (store, VF1)]
+ // group the recipes together to emit separate remarks for:
+ // load (VF1, VF2)
+ // store (VF1)
+ auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
+ auto Subset = ArrayRef<RecipeVFPair>();
do {
if (Subset.empty())
Subset = Tail.take_front(1);
- Instruction *I = Subset.front().first;
-
- // If the next instruction is different, or if there are no other pairs,
+ VPRecipeBase *R = Subset.front().first;
+
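+    // Map the recipe to a representative IR opcode so the remark can name the
+    // operation.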
+ unsigned Opcode =
+ TypeSwitch<const VPRecipeBase *, unsigned>(R)
+ .Case<VPHeaderPHIRecipe>(
+ [](const auto *R) { return Instruction::PHI; })
+ .Case<VPWidenSelectRecipe>(
+ [](const auto *R) { return Instruction::Select; })
+ .Case<VPWidenStoreRecipe>(
+ [](const auto *R) { return Instruction::Store; })
+ .Case<VPWidenLoadRecipe>(
+ [](const auto *R) { return Instruction::Load; })
+ .Case<VPWidenCallRecipe>(
+ [](const auto *R) { return Instruction::Call; })
+ .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
+ VPWidenCastRecipe>(
+ [](const auto *R) { return R->getOpcode(); })
+ .Case<VPInterleaveRecipe>([](const VPInterleaveRecipe *R) {
+ return R->getStoredValues().empty() ? Instruction::Load
+ : Instruction::Store;
+ });
+
+ // If the next recipe is different, or if there are no other pairs,
// emit a remark for the collated subset. e.g.
- // [(load, vf1), (load, vf2))]
+    //   [(load, VF1), (load, VF2)]
// to emit:
- // remark: invalid costs for 'load' at VF=(vf, vf2)
- if (Subset == Tail || Tail[Subset.size()].first != I) {
+ // remark: invalid costs for 'load' at VF=(VF1, VF2)
+ if (Subset == Tail || Tail[Subset.size()].first != R) {
std::string OutString;
raw_string_ostream OS(OutString);
assert(!Subset.empty() && "Unexpected empty range");
- OS << "Instruction with invalid costs prevented vectorization at VF=(";
+ OS << "Recipe with invalid costs prevented vectorization at VF=(";
for (const auto &Pair : Subset)
OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
OS << "):";
- if (auto *CI = dyn_cast<CallInst>(I))
- OS << " call to " << CI->getCalledFunction()->getName();
- else
- OS << " " << I->getOpcodeName();
+ if (Opcode == Instruction::Call) {
+ auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
+ Function *CalledFn =
+ WidenCall ? WidenCall->getCalledScalarFunction()
+ : cast<Function>(R->getOperand(R->getNumOperands() - 1)
+ ->getLiveInIRValue());
+ OS << " call to " << CalledFn->getName();
+ } else
+ OS << " " << Instruction::getOpcodeName(Opcode);
OS.flush();
- reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
+ reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, nullptr,
+ R->getDebugLoc());
Tail = Tail.drop_front(Subset.size());
Subset = {};
} else
@@ -4536,14 +4573,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
ChosenFactor.Cost = InstructionCost::getMax();
}
- SmallVector<InstructionVFPair> InvalidCosts;
for (auto &P : VPlans) {
for (ElementCount VF : P->vectorFactors()) {
// The cost for scalar VF=1 is already calculated, so ignore it.
if (VF.isScalar())
continue;
- InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+ InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
#ifndef NDEBUG
@@ -4578,8 +4614,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}
}
- emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
-
if (!EnableCondStoresVectorization && CM.hasPredStores()) {
reportVectorizationFailure(
"There are conditional stores.",
@@ -5484,8 +5518,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}
-InstructionCost LoopVectorizationCostModel::expectedCost(
- ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
+InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
InstructionCost Cost;
// For each block.
@@ -5505,10 +5538,6 @@ InstructionCost LoopVectorizationCostModel::expectedCost(
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
C = InstructionCost(ForceTargetInstructionCost);
- // Keep a list of instructions with invalid costs.
- if (Invalid && !C.isValid())
- Invalid->emplace_back(&I, VF);
-
BlockCost += C;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
<< VF << " For instruction: " << I << '\n');
@@ -9867,6 +9896,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Plan how to best vectorize, return the best VF and its cost.
std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+ if (ORE->allowExtraAnalysis(LV_NAME))
+ LVP.emitInvalidCostRemarks(ORE);
+
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
@@ -10029,7 +10061,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &LVL, &CM, BFI, PSI, Checks);
- assert(EPI.MainLoopVF == VF.Width && "VFs must match");
std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b74417f..8d2ce6b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4842,11 +4842,46 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
if (!AnyConsecutive)
return false;
+  // If we have a better order, also sort the base pointers by increasing
+  // (variable) values if possible, to try to keep the order more regular. In
+  // order to create a valid strict weak ordering we cluster by the Root of
+  // gep chains and sort within each.
+ SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
for (auto &Base : Bases) {
- for (auto &T : Base.second)
- SortedIndices.push_back(std::get<2>(T));
+ Value *Strip = Base.first->stripInBoundsConstantOffsets();
+ Value *Root = Strip;
+ while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
+ Root = Gep->getOperand(0);
+ SortedBases.emplace_back(Base.first, Strip, Root);
+ }
+ auto *Begin = SortedBases.begin();
+ auto *End = SortedBases.end();
+ while (Begin != End) {
+ Value *Root = std::get<2>(*Begin);
+ auto *Mid = std::stable_partition(
+ Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
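+    // Within this Root cluster, order the stripped bases so that a base comes
+    // before any base whose gep chain passes through it.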
+ DenseMap<Value *, DenseMap<Value *, bool>> LessThan;
+ for (auto I = Begin; I < Mid; ++I)
+ LessThan.try_emplace(std::get<1>(*I));
+ for (auto I = Begin; I < Mid; ++I) {
+ Value *V = std::get<1>(*I);
+ while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
+ V = Gep->getOperand(0);
+ if (LessThan.contains(V))
+ LessThan[V][std::get<1>(*I)] = true;
+ }
+ }
+ std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
+ return LessThan[std::get<1>(V1)][std::get<1>(V2)];
+ });
+ Begin = Mid;
}
+  // Collect the final order of sorted indices.
+ for (auto Base : SortedBases)
+ for (auto &T : Bases[std::get<0>(Base)])
+ SortedIndices.push_back(std::get<2>(T));
+
assert(SortedIndices.size() == VL.size() &&
"Expected SortedIndices to be the size of VL");
return true;
@@ -8363,6 +8398,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
: TTI.getStridedMemoryOpCost(
Instruction::Load, LoadTy, LI->getPointerOperand(),
/*VariableMask=*/false, Alignment, CostKind, LI);
+ // Add external uses costs.
+ for (auto [Idx, V] : enumerate(VL.slice(
+ P.first, std::min<unsigned>(VL.size() - P.first, VF))))
+ if (!R.areAllUsersVectorized(cast<Instruction>(V)))
+ GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
+ LoadTy, CostKind, Idx);
// Estimate GEP cost.
SmallVector<Value *> PointerOps(VF);
for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
@@ -9713,6 +9754,23 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
}
VecCost = std::min(VecCost, IntrinsicCost);
}
+ if (auto *SI = dyn_cast<SelectInst>(VL0)) {
+ auto *CondType =
+ getWidenedType(SI->getCondition()->getType(), VL.size());
+ unsigned CondNumElements = CondType->getNumElements();
+ unsigned VecTyNumElements = getNumElements(VecTy);
+ assert(VecTyNumElements >= CondNumElements &&
+ VecTyNumElements % CondNumElements == 0 &&
+ "Cannot vectorize Instruction::Select");
+ if (CondNumElements != VecTyNumElements) {
+        // When the return type is i1 but the source is a fixed vector type, we
+ // need to duplicate the condition value.
+ VecCost += TTI->getShuffleCost(
+ TTI::SK_PermuteSingleSrc, CondType,
+ createReplicatedMask(VecTyNumElements / CondNumElements,
+ CondNumElements));
+ }
+ }
return VecCost + CommonCost;
};
return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -13196,6 +13254,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
}
+ unsigned CondNumElements = getNumElements(Cond->getType());
+ unsigned TrueNumElements = getNumElements(True->getType());
+ assert(TrueNumElements >= CondNumElements &&
+ TrueNumElements % CondNumElements == 0 &&
+ "Cannot vectorize Instruction::Select");
+ assert(TrueNumElements == getNumElements(False->getType()) &&
+ "Cannot vectorize Instruction::Select");
+ if (CondNumElements != TrueNumElements) {
+    // When the return type is i1 but the source is a fixed vector type, we
+ // need to duplicate the condition value.
+ Cond = Builder.CreateShuffleVector(
+ Cond, createReplicatedMask(TrueNumElements / CondNumElements,
+ CondNumElements));
+ }
+ assert(getNumElements(Cond->getType()) == TrueNumElements &&
+ "Cannot vectorize Instruction::Select");
Value *V = Builder.CreateSelect(Cond, True, False);
V = FinalShuffle(V, E, VecTy);
@@ -13886,11 +13960,18 @@ Value *BoUpSLP::vectorizeTree(
}
if (!Ex) {
// "Reuse" the existing extract to improve final codegen.
- if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+ if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
+ ES && isa<Instruction>(Vec)) {
Value *V = ES->getVectorOperand();
+ auto *IVec = cast<Instruction>(Vec);
if (const TreeEntry *ETE = getTreeEntry(V))
V = ETE->VectorizedValue;
- Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+ if (auto *IV = dyn_cast<Instruction>(V);
+ !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
+ IV->comesBefore(IVec))
+ Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
+ else
+ Ex = Builder.CreateExtractElement(Vec, Lane);
} else if (ReplaceGEP) {
// Leave the GEPs as is, they are free in most cases and better to
// keep them as GEPs.
@@ -16638,8 +16719,6 @@ class HorizontalReduction {
SmallVector<SmallVector<Value *>> ReducedVals;
/// Maps reduced value to the corresponding reduction operation.
DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
- // Use map vector to make stable output.
- MapVector<Instruction *, Value *> ExtraArgs;
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
@@ -16972,30 +17051,26 @@ public:
// gather all the reduced values, sorting them by their value id.
BasicBlock *BB = Root->getParent();
bool IsCmpSelMinMax = isCmpSelMinMax(Root);
- SmallVector<Instruction *> Worklist(1, Root);
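+    // Pair each candidate reduction operation with its traversal depth outside
+    // the root block; CheckOperands treats operands past RecursionMaxDepth as
+    // reduced values rather than reduction ops.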
+ SmallVector<std::pair<Instruction *, unsigned>> Worklist(
+ 1, std::make_pair(Root, 0));
// Checks if the operands of the \p TreeN instruction are also reduction
// operations or should be treated as reduced values or an extra argument,
// which is not part of the reduction.
auto CheckOperands = [&](Instruction *TreeN,
- SmallVectorImpl<Value *> &ExtraArgs,
SmallVectorImpl<Value *> &PossibleReducedVals,
- SmallVectorImpl<Instruction *> &ReductionOps) {
+ SmallVectorImpl<Instruction *> &ReductionOps,
+ unsigned Level) {
for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
getNumberOfOperands(TreeN)))) {
Value *EdgeVal = getRdxOperand(TreeN, I);
ReducedValsToOps[EdgeVal].push_back(TreeN);
auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
- // Edge has wrong parent - mark as an extra argument.
- if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
- !hasSameParent(EdgeInst, BB)) {
- ExtraArgs.push_back(EdgeVal);
- continue;
- }
// If the edge is not an instruction, or it is different from the main
// reduction opcode or has too many uses - possible reduced value.
// Also, do not try to reduce const values, if the operation is not
// foldable.
- if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
+ if (!EdgeInst || Level > RecursionMaxDepth ||
+ getRdxKind(EdgeInst) != RdxKind ||
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
!isVectorizable(RdxKind, EdgeInst) ||
@@ -17019,6 +17094,7 @@ public:
SmallSet<size_t, 2> LoadKeyUsed;
auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
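+      // Include the load's parent block in the key so loads from different
+      // blocks get distinct subkeys.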
+ Key = hash_combine(hash_value(LI->getParent()), Key);
Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
if (LoadKeyUsed.contains(Key)) {
auto LIt = LoadsMap.find(Ptr);
@@ -17049,40 +17125,23 @@ public:
};
while (!Worklist.empty()) {
- Instruction *TreeN = Worklist.pop_back_val();
- SmallVector<Value *> Args;
+ auto [TreeN, Level] = Worklist.pop_back_val();
SmallVector<Value *> PossibleRedVals;
SmallVector<Instruction *> PossibleReductionOps;
- CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
- // If too many extra args - mark the instruction itself as a reduction
- // value, not a reduction operation.
- if (Args.size() < 2) {
- addReductionOps(TreeN);
- // Add extra args.
- if (!Args.empty()) {
- assert(Args.size() == 1 && "Expected only single argument.");
- ExtraArgs[TreeN] = Args.front();
- }
- // Add reduction values. The values are sorted for better vectorization
- // results.
- for (Value *V : PossibleRedVals) {
- size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
- /*AllowAlternate=*/false);
- ++PossibleReducedVals[Key][Idx]
- .insert(std::make_pair(V, 0))
- .first->second;
- }
- Worklist.append(PossibleReductionOps.rbegin(),
- PossibleReductionOps.rend());
- } else {
+ CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
+ addReductionOps(TreeN);
+ // Add reduction values. The values are sorted for better vectorization
+ // results.
+ for (Value *V : PossibleRedVals) {
size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
+ std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
/*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
- .insert(std::make_pair(TreeN, 0))
+ .insert(std::make_pair(V, 0))
.first->second;
}
+ for (Instruction *I : reverse(PossibleReductionOps))
+ Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
}
auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
// Sort values by the total number of values kinds to start the reduction
@@ -17159,18 +17218,9 @@ public:
// Track the reduced values in case if they are replaced by extractelement
// because of the vectorization.
- DenseMap<Value *, WeakTrackingVH> TrackedVals(
- ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
- BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
+ ReducedVals.front().size());
SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
- ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
- // The same extra argument may be used several times, so log each attempt
- // to use it.
- for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
- assert(Pair.first && "DebugLoc must be set.");
- ExternallyUsedValues[Pair.second].push_back(Pair.first);
- TrackedVals.try_emplace(Pair.second, Pair.second);
- }
// The compare instruction of a min/max is the insertion point for new
// instructions and may be replaced with a new compare instruction.
@@ -17205,13 +17255,9 @@ public:
// Initialize the final value in the reduction.
return Res;
};
- bool AnyBoolLogicOp =
- any_of(ReductionOps.back(), [](Value *V) {
- return isBoolLogicOp(cast<Instruction>(V));
- });
- // The reduction root is used as the insertion point for new instructions,
- // so set it as externally used to prevent it from being deleted.
- ExternallyUsedValues[ReductionRoot];
+ bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
+ return isBoolLogicOp(cast<Instruction>(V));
+ });
SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
ReductionOps.front().size());
for (ReductionOpsType &RdxOps : ReductionOps)
@@ -17433,8 +17479,11 @@ public:
V.reorderBottomToTop(/*IgnoreReorder=*/true);
// Keep extracted other reduction values, if they are used in the
// vectorization trees.
- BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
- ExternallyUsedValues);
+ BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues;
+ // The reduction root is used as the insertion point for new
+ // instructions, so set it as externally used to prevent it from being
+ // deleted.
+ LocalExternallyUsedValues[ReductionRoot];
for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
continue;
@@ -17481,23 +17530,6 @@ public:
for (Value *RdxVal : VL)
if (RequiredExtract.contains(RdxVal))
LocalExternallyUsedValues[RdxVal];
- // Update LocalExternallyUsedValues for the scalar, replaced by
- // extractelement instructions.
- DenseMap<Value *, Value *> ReplacementToExternal;
- for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
- ReplacementToExternal.try_emplace(Pair.second, Pair.first);
- for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
- Value *Ext = Pair.first;
- auto RIt = ReplacementToExternal.find(Ext);
- while (RIt != ReplacementToExternal.end()) {
- Ext = RIt->second;
- RIt = ReplacementToExternal.find(Ext);
- }
- auto *It = ExternallyUsedValues.find(Ext);
- if (It == ExternallyUsedValues.end())
- continue;
- LocalExternallyUsedValues[Pair.second].append(It->second);
- }
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
@@ -17699,11 +17731,6 @@ public:
ExtraReductions.emplace_back(RedOp, RdxVal);
}
}
- for (auto &Pair : ExternallyUsedValues) {
- // Add each externally used value to the final reduction.
- for (auto *I : Pair.second)
- ExtraReductions.emplace_back(I, Pair.first);
- }
// Iterate through all not-vectorized reduction values/extra arguments.
bool InitStep = true;
while (ExtraReductions.size() > 1) {
@@ -17855,6 +17882,8 @@ private:
assert(IsSupportedHorRdxIdentityOp &&
"The optimization of matched scalar identity horizontal reductions "
"must be supported.");
+ if (Cnt == 1)
+ return VectorizedValue;
switch (RdxKind) {
case RecurKind::Add: {
// res = mul vv, n
diff --git a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
index e4e2914..c45b6c3 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll
@@ -34,8 +34,8 @@ define void @casts() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> undef)
@@ -54,8 +54,8 @@ define void @casts() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f64s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f64(<4 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f64u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f64(<4 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f64s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f64(<4 x double> undef)
@@ -74,8 +74,8 @@ define void @casts() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f32(<8 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f64s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f64(<8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f64u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f64(<8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v8f64s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f64(<8 x double> undef)
@@ -94,8 +94,8 @@ define void @casts() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f32u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f32(<16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 360 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f32(<16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f32(<16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16f64s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f64(<16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16f64u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f64(<16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16f64s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f64(<16 x double> undef)
@@ -223,56 +223,56 @@ define void @casts() {
define void @fp16() {
; CHECK-NOFP16-LABEL: 'fp16'
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 149 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 325 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 373 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 281 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 373 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 281 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 262 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 342 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 286 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-FP16-LABEL: 'fp16'
@@ -284,48 +284,48 @@ define void @fp16() {
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 206 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
-; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
+; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
deleted file mode 100644
index 72a3392..0000000
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
+++ /dev/null
@@ -1,103 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=ALL %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
-
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; END.
-
-define i32 @fcopysign(i32 %arg) {
-; ALL-LABEL: 'fcopysign'
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
-;
-; ALL-SIZE-LABEL: 'fcopysign'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
- %F32 = call float @llvm.copysign.f32(float undef, float undef)
- %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
- %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
- %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
-
- %F64 = call double @llvm.copysign.f64(double undef, double undef)
- %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
- %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
- %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-
- ret i32 undef
-}
-
-define i32 @fsqrt(i32 %arg) {
-; ALL-LABEL: 'fsqrt'
-; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
-; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
-;
-; ALL-SIZE-LABEL: 'fsqrt'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = call float @llvm.sqrt.f32(float undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
- %F32 = call float @llvm.sqrt.f32(float undef)
- %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
- %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
- %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-
- %F64 = call double @llvm.sqrt.f64(double undef)
- %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
- %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
- %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
-
- ret i32 undef
-}
-
-declare float @llvm.copysign.f32(float, float)
-declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
-declare <8 x float> @llvm.copysign.v8f32(<8 x float>, <8 x float>)
-declare <16 x float> @llvm.copysign.v16f32(<16 x float>, <16 x float>)
-
-declare double @llvm.copysign.f64(double, double)
-declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)
-declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>)
-declare <8 x double> @llvm.copysign.v8f64(<8 x double>, <8 x double>)
-
-declare float @llvm.sqrt.f32(float)
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
-declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
-
-declare double @llvm.sqrt.f64(double)
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
-declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
-declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll b/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll
new file mode 100644
index 0000000..2cee151
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arithmetic_fence.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
+
+define void @arithmetic_fence_f16() {
+; ALL-LABEL: 'arithmetic_fence_f16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.arithmetic.fence.f16(half undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_f16'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.arithmetic.fence.f16(half undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.arithmetic.fence.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.arithmetic.fence.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.arithmetic.fence.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.arithmetic.fence.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.arithmetic.fence.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.arithmetic.fence.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.arithmetic.fence.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.arithmetic.fence.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @arithmetic_fence_bf16() {
+; ALL-LABEL: 'arithmetic_fence_bf16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_bf16'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.arithmetic.fence.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.arithmetic.fence.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.arithmetic.fence.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.arithmetic.fence.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.arithmetic.fence.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.arithmetic.fence.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @arithmetic_fence_f32() {
+; ALL-LABEL: 'arithmetic_fence_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.arithmetic.fence.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.arithmetic.fence.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.arithmetic.fence.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.arithmetic.fence.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.arithmetic.fence.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.arithmetic.fence.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.arithmetic.fence.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.arithmetic.fence.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.arithmetic.fence.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.arithmetic.fence.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @arithmetic_fence_f64() {
+; ALL-LABEL: 'arithmetic_fence_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.arithmetic.fence.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'arithmetic_fence_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.arithmetic.fence.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.arithmetic.fence.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.arithmetic.fence.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.arithmetic.fence.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.arithmetic.fence.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.arithmetic.fence.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.arithmetic.fence.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.arithmetic.fence.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.arithmetic.fence.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
new file mode 100644
index 0000000..e980c910
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @canonicalize_f16() {
+; BASE-LABEL: 'canonicalize_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'canonicalize_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'canonicalize_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'canonicalize_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'canonicalize_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'canonicalize_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'canonicalize_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'canonicalize_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.canonicalize.f16(half undef) #1
+ %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) #1
+ %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) #1
+ %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) #1
+ %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) #1
+ %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) #1
+ %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) #1
+ ret void
+}
+
+define void @canonicalize_bf16() {
+; BASE-LABEL: 'canonicalize_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'canonicalize_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'canonicalize_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'canonicalize_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'canonicalize_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'canonicalize_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'canonicalize_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'canonicalize_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1
+ %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1
+ %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1
+ %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) #1
+ %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) #1
+ %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) #1
+ %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) #1
+ ret void
+}
+
+define void @canonicalize_f32() {
+; ALL-LABEL: 'canonicalize_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'canonicalize_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.canonicalize.f32(float undef) #1
+ %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1
+ %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1
+ %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) #1
+ %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) #1
+ %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) #1
+ %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) #1
+ %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) #1
+ ret void
+}
+
+define void @canonicalize_f64() {
+; ALL-LABEL: 'canonicalize_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'canonicalize_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.canonicalize.f64(double undef) #1
+ %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1
+ %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) #1
+ %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) #1
+ %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) #1
+ %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) #1
+ %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1
+ ret void
+}
+
+
+
+
+
+
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
new file mode 100644
index 0000000..06a058f
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @copysign_f16() {
+; BASE-LABEL: 'copysign_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'copysign_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'copysign_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'copysign_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'copysign_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'copysign_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'copysign_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'copysign_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.copysign.f16(half undef, half undef)
+ %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+ %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+ %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+ ret void
+}
+
+define void @copysign_f32() {
+; ALL-LABEL: 'copysign_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.copysign.f32(float undef, float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'copysign_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.copysign.f32(float undef, float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.copysign.f32(float undef, float undef)
+ %v2f32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.copysign.v3f32(<3 x float> undef, <3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.copysign.v5f32(<5 x float> undef, <5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+ %v9f32 = call <9 x float> @llvm.copysign.v9f32(<9 x float> undef, <9 x float> undef)
+ %v16f32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+ ret void
+}
+
+define void @copysign_bf16() {
+; BASE-LABEL: 'copysign_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'copysign_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'copysign_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'copysign_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'copysign_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'copysign_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'copysign_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'copysign_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.copysign.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+ %v9bf16 = call <9 x bfloat> @llvm.copysign.v9bf16(<9 x bfloat> undef, <9 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+ ret void
+}
+
+define void @copysign_f64() {
+; ALL-LABEL: 'copysign_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'copysign_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.copysign.f64(double undef, double undef)
+ %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+ %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+ %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp.ll
new file mode 100644
index 0000000..a94b794
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/exp.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @exp_f16() {
+; BASE-LABEL: 'exp_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.exp.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.exp.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.exp.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.exp.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @exp_bf16() {
+; BASE-LABEL: 'exp_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.exp.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.exp.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.exp.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.exp.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @exp_f32() {
+; ALL-LABEL: 'exp_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.exp.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.exp.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.exp.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.exp.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @exp_f64() {
+; ALL-LABEL: 'exp_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.exp.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.exp.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.exp.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.exp.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll
new file mode 100644
index 0000000..0fea64d
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/exp10.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @exp10_f16() {
+; BASE-LABEL: 'exp10_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp10_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp10_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp10_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp10_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp10_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp10_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp10_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp10.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.exp10.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.exp10.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.exp10.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.exp10.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.exp10.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.exp10.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.exp10.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.exp10.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @exp10_bf16() {
+; BASE-LABEL: 'exp10_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp10_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp10_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp10_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp10_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp10_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp10_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp10_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.exp10.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.exp10.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.exp10.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.exp10.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.exp10.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.exp10.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.exp10.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.exp10.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @exp10_f32() {
+; ALL-LABEL: 'exp10_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp10.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp10_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp10.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.exp10.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.exp10.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.exp10.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.exp10.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.exp10.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.exp10.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.exp10.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.exp10.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @exp10_f64() {
+; ALL-LABEL: 'exp10_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp10.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp10_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp10.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.exp10.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.exp10.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.exp10.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.exp10.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.exp10.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.exp10.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.exp10.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.exp10.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll b/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll
new file mode 100644
index 0000000..d1ff5e1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/exp2.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @exp2_f16() {
+; BASE-LABEL: 'exp2_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp2_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp2_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp2_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp2_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp2_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp2_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp2_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.exp2.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.exp2.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.exp2.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.exp2.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.exp2.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @exp2_bf16() {
+; BASE-LABEL: 'exp2_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'exp2_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'exp2_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'exp2_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'exp2_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'exp2_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'exp2_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'exp2_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.exp2.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.exp2.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.exp2.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.exp2.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @exp2_f32() {
+; ALL-LABEL: 'exp2_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp2.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp2_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.exp2.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.exp2.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.exp2.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.exp2.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.exp2.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @exp2_f64() {
+; ALL-LABEL: 'exp2_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.exp2.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'exp2_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.exp2.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.exp2.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.exp2.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.exp2.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.exp2.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
index daad19e..da198ad 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -1,116 +1,139 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
-; END.
-define amdgpu_kernel void @fabs_f32() #0 {
-; ALL-LABEL: 'fabs_f32'
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #2
+define void @fabs_f16() {
+; ALL-LABEL: 'fabs_f16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; ALL-SIZE-LABEL: 'fabs_f32'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #2
+; ALL-SIZE-LABEL: 'fabs_f16'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %f32 = call float @llvm.fabs.f32(float undef) #1
- %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #1
- %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #1
- %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) #1
- %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #1
- %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) #1
- %v9f32 = call <9 x float> @llvm.fabs.v9f32(<9 x float> undef) #1
+ %f16 = call half @llvm.fabs.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.fabs.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef)
ret void
}
-define amdgpu_kernel void @fabs_f64() #0 {
-; ALL-LABEL: 'fabs_f64'
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #2
+define void @fabs_bf16() {
+; ALL-LABEL: 'fabs_bf16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; ALL-SIZE-LABEL: 'fabs_f64'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #2
+; ALL-SIZE-LABEL: 'fabs_bf16'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %f64 = call double @llvm.fabs.f64(double undef) #1
- %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #1
- %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #1
- %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) #1
- %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef) #1
+ %bf16 = call bfloat @llvm.fabs.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.fabs.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.fabs.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.fabs.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.fabs.v17bf16(<17 x bfloat> undef)
ret void
}
-define amdgpu_kernel void @fabs_f16() #0 {
-; ALL-LABEL: 'fabs_f16'
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #2
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #2
+define void @fabs_f32() {
+; ALL-LABEL: 'fabs_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; ALL-SIZE-LABEL: 'fabs_f16'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = call half @llvm.fabs.f16(half undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #2
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #2
+; ALL-SIZE-LABEL: 'fabs_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = call float @llvm.fabs.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %f16 = call half @llvm.fabs.f16(half undef) #1
- %v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #1
- %v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #1
- %v4f16 = call <4 x half> @llvm.fabs.v4f16(<4 x half> undef) #1
- %v5f16 = call <5 x half> @llvm.fabs.v5f16(<5 x half> undef) #1
- %v16f16 = call <16 x half> @llvm.fabs.v16f16(<16 x half> undef) #1
- %v17f16 = call <17 x half> @llvm.fabs.v17f16(<17 x half> undef) #1
+ %f32 = call float @llvm.fabs.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.fabs.v17f32(<17 x float> undef)
ret void
}
-declare float @llvm.fabs.f32(float) #1
-declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #1
-declare <3 x float> @llvm.fabs.v3f32(<3 x float>) #1
-declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1
-declare <5 x float> @llvm.fabs.v5f32(<5 x float>) #1
-declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #1
-declare <9 x float> @llvm.fabs.v9f32(<9 x float>) #1
-
-declare double @llvm.fabs.f64(double) #1
-declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1
-declare <3 x double> @llvm.fabs.v3f64(<3 x double>) #1
-declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #1
-declare <5 x double> @llvm.fabs.v5f64(<5 x double>) #1
-
-declare half @llvm.fabs.f16(half) #1
-declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
-declare <3 x half> @llvm.fabs.v3f16(<3 x half>) #1
-declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
-declare <5 x half> @llvm.fabs.v5f16(<5 x half>) #1
-declare <16 x half> @llvm.fabs.v16f16(<16 x half>) #1
-declare <17 x half> @llvm.fabs.v17f16(<17 x half>) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+define void @fabs_f64() {
+; ALL-LABEL: 'fabs_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'fabs_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = call double @llvm.fabs.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.fabs.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.fabs.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.fabs.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.fabs.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index ab4e982..2ff9d4f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -1,214 +1,167 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,FASTF64 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,FASTF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW-SIZE %s
-; END.
-define amdgpu_kernel void @fma_f32() #0 {
-; SLOWF64-LABEL: 'fma_f32'
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; FASTF64-LABEL: 'fma_f32'
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s
+
+
+define void @fma_f16() {
+; FAST-LABEL: 'fma_f16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; SLOW-LABEL: 'fma_f32'
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-LABEL: 'fma_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; SLOWF64-SIZE-LABEL: 'fma_f32'
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; FASTF64-SIZE-LABEL: 'fma_f32'
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-SIZE-LABEL: 'fma_f16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; SLOW-SIZE-LABEL: 'fma_f32'
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOW-SIZE-LABEL: 'fma_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1
- %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #1
- %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #1
- %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #1
- %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #1
- %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #1
- %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #1
+ %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
+ %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+ %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
ret void
}
-define amdgpu_kernel void @fma_f64() #0 {
-; SLOWF64-LABEL: 'fma_f64'
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; FASTF64-LABEL: 'fma_f64'
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+define void @fma_bf16() {
+; FAST-LABEL: 'fma_bf16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; SLOW-LABEL: 'fma_f64'
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
+; SLOW-LABEL: 'fma_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; SLOWF64-SIZE-LABEL: 'fma_f64'
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; FASTF64-SIZE-LABEL: 'fma_f64'
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
-; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-SIZE-LABEL: 'fma_bf16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
-; SLOW-SIZE-LABEL: 'fma_f64'
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #2
+; SLOW-SIZE-LABEL: 'fma_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #1
- %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #1
- %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #1
- %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) #1
- %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) #1
+ %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
ret void
}
-define amdgpu_kernel void @fma_f16() #0 {
-; FAST-LABEL: 'fma_f16'
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; SLOW-LABEL: 'fma_f16'
-; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+define void @fma_f32() {
+; SLOW-LABEL: 'fma_f32'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
-; FAST-SIZE-LABEL: 'fma_f16'
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; SLOW-SIZE-LABEL: 'fma_f16'
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+; SLOW-SIZE-LABEL: 'fma_f32'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #1
- %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #1
- %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #1
- %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #1
- %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #1
- %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #1
- %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #1
+ %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
+ %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+ %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
ret void
}
-declare float @llvm.fma.f32(float, float, float) #1
-declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
-declare <3 x float> @llvm.fma.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
-declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
-declare <5 x float> @llvm.fma.v5f32(<5 x float>, <5 x float>, <5 x float>) #1
-declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #1
-declare <9 x float> @llvm.fma.v9f32(<9 x float>, <9 x float>, <9 x float>) #1
-
-declare double @llvm.fma.f64(double, double, double) #1
-declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #1
-declare <3 x double> @llvm.fma.v3f64(<3 x double>, <3 x double>, <3 x double>) #1
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #1
-declare <5 x double> @llvm.fma.v5f64(<5 x double>, <5 x double>, <5 x double>) #1
-
-declare half @llvm.fma.f16(half, half, half) #1
-declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
-declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>) #1
-declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
-declare <5 x half> @llvm.fma.v5f16(<5 x half>, <5 x half>, <5 x half>) #1
-declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) #1
-declare <17 x half> @llvm.fma.v17f16(<17 x half>, <17 x half>, <17 x half>) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+define void @fma_f64() {
+; SLOW-LABEL: 'fma_f64'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fma_f64'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
+ %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 2e4a9c7..adc4eea 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -158,4 +158,55 @@ define amdgpu_kernel void @fmul_f16() #0 {
ret void
}
+define amdgpu_kernel void @fmul_bf16() #0 {
+; GFX9-LABEL: 'fmul_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmul_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'fmul_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmul_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = fmul bfloat undef, undef
+ %v2bf16 = fmul <2 x bfloat> undef, undef
+ %v3bf16 = fmul <3 x bfloat> undef, undef
+ %v4bf16 = fmul <4 x bfloat> undef, undef
+ %v5bf16 = fmul <5 x bfloat> undef, undef
+ %v16bf16 = fmul <16 x bfloat> undef, undef
+ %v17bf16 = fmul <17 x bfloat> undef, undef
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll
new file mode 100644
index 0000000..c6153bb
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s
+
+
+define void @fmuladd_f16() {
+; FAST-LABEL: 'fmuladd_f16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmuladd_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'fmuladd_f16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef)
+ %v2f16 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.fmuladd.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.fmuladd.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
+ %v16f16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.fmuladd.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+ ret void
+}
+
+define void @fmuladd_bf16() {
+; FAST-LABEL: 'fmuladd_bf16'
+; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-LABEL: 'fmuladd_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FAST-SIZE-LABEL: 'fmuladd_bf16'
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.fmuladd.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.fmuladd.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+ ret void
+}
+
+define void @fmuladd_f32() {
+; SLOW-LABEL: 'fmuladd_f32'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_f32'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef)
+ %v2f32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.fmuladd.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+ %v9f32 = call <9 x float> @llvm.fmuladd.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+ ret void
+}
+
+define void @fmuladd_f64() {
+; SLOW-LABEL: 'fmuladd_f64'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'fmuladd_f64'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef)
+ %v2f64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.fmuladd.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+ ret void
+}
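The FAST/SLOW and FAST-SIZE/SLOW-SIZE prefixes in the new fmuladd.ll checks are selected by the RUN lines at the top of that file. As a rough sketch only (assuming opt and FileCheck from the same LLVM build are on PATH and the command is run from the llvm-project root), one throughput configuration could be reproduced by hand like this:

  # Mirrors the gfx90a FAST RUN line; the file path is the test shown above.
  opt -passes="print<cost-model>" -disable-output \
      -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops \
      llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll 2>&1 | \
    FileCheck -check-prefixes=FAST llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll

The code-size RUN lines differ only in passing -cost-kind=code-size to opt and checking the corresponding *-SIZE prefixes.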
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
index 725466a..9af5dd3 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fneg.ll
@@ -3,7 +3,75 @@
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZE %s
; END.
-define amdgpu_kernel void @fneg_f32() {
+define void @fneg_f16() {
+; CHECK-LABEL: 'fneg_f16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'fneg_f16'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x half> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = fneg half undef
+ %v2f16 = fneg <2 x half> undef
+ %v3f16 = fneg <3 x half> undef
+ %v4f16 = fneg <4 x half> undef
+ %v5f16 = fneg <5 x half> undef
+ %v8f16 = fneg <8 x half> undef
+ %v16f16 = fneg <16 x half> undef
+ %v17f16 = fneg <17 x half> undef
+ ret void
+}
+
+define void @fneg_bf16() {
+; CHECK-LABEL: 'fneg_bf16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg bfloat undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x bfloat> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x bfloat> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x bfloat> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x bfloat> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x bfloat> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x bfloat> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x bfloat> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'fneg_bf16'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg bfloat undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x bfloat> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x bfloat> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x bfloat> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x bfloat> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = fneg <8 x bfloat> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x bfloat> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x bfloat> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = fneg bfloat undef
+ %v2f16 = fneg <2 x bfloat> undef
+ %v3f16 = fneg <3 x bfloat> undef
+ %v4f16 = fneg <4 x bfloat> undef
+ %v5f16 = fneg <5 x bfloat> undef
+ %v8f16 = fneg <8 x bfloat> undef
+ %v16f16 = fneg <16 x bfloat> undef
+ %v17f16 = fneg <17 x bfloat> undef
+ ret void
+}
+
+define void @fneg_f32() {
; CHECK-LABEL: 'fneg_f32'
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f32 = fneg float undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32 = fneg <2 x float> undef
@@ -12,6 +80,7 @@ define amdgpu_kernel void @fneg_f32() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = fneg <5 x float> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = fneg <8 x float> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = fneg <9 x float> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = fneg <16 x float> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SIZE-LABEL: 'fneg_f32'
@@ -22,6 +91,7 @@ define amdgpu_kernel void @fneg_f32() {
; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f32 = fneg <5 x float> undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = fneg <8 x float> undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9f32 = fneg <9 x float> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f32 = fneg <16 x float> undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f32 = fneg float undef
@@ -31,16 +101,19 @@ define amdgpu_kernel void @fneg_f32() {
%v5f32 = fneg <5 x float> undef
%v8f32 = fneg <8 x float> undef
%v9f32 = fneg <9 x float> undef
+ %v16f32 = fneg <16 x float> undef
ret void
}
-define amdgpu_kernel void @fneg_f64() {
+define void @fneg_f64() {
; CHECK-LABEL: 'fneg_f64'
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f64 = fneg double undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64 = fneg <2 x double> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = fneg <3 x double> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = fneg <4 x double> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = fneg <5 x double> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = fneg <8 x double> undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = fneg <16 x double> undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; SIZE-LABEL: 'fneg_f64'
@@ -49,6 +122,8 @@ define amdgpu_kernel void @fneg_f64() {
; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f64 = fneg <3 x double> undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = fneg <4 x double> undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f64 = fneg <5 x double> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f64 = fneg <8 x double> undef
+; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f64 = fneg <16 x double> undef
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f64 = fneg double undef
@@ -56,37 +131,8 @@ define amdgpu_kernel void @fneg_f64() {
%v3f64 = fneg <3 x double> undef
%v4f64 = fneg <4 x double> undef
%v5f64 = fneg <5 x double> undef
- ret void
-}
-
-define amdgpu_kernel void @fneg_f16() {
-; CHECK-LABEL: 'fneg_f16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; SIZE-LABEL: 'fneg_f16'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %f16 = fneg half undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f16 = fneg <2 x half> undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3f16 = fneg <3 x half> undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = fneg <4 x half> undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5f16 = fneg <5 x half> undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = fneg <16 x half> undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v17f16 = fneg <17 x half> undef
-; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %f16 = fneg half undef
- %v2f16 = fneg <2 x half> undef
- %v3f16 = fneg <3 x half> undef
- %v4f16 = fneg <4 x half> undef
- %v5f16 = fneg <5 x half> undef
- %v16f16 = fneg <16 x half> undef
- %v17f16 = fneg <17 x half> undef
+ %v8f64 = fneg <8 x double> undef
+ %v16f64 = fneg <16 x double> undef
ret void
}
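As the NOTE lines state, the check lines in these cost-model tests are autogenerated by utils/update_analyze_test_checks.py, so after editing the IR they are normally regenerated rather than updated by hand. A minimal sketch of that workflow (assuming a freshly built opt is on PATH and the script is invoked from the llvm-project root):

  # Regenerate the FileCheck assertions for the touched tests.
  python3 llvm/utils/update_analyze_test_checks.py \
      llvm/test/Analysis/CostModel/AMDGPU/fneg.ll \
      llvm/test/Analysis/CostModel/AMDGPU/fmuladd.ll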
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
new file mode 100644
index 0000000..22134d0
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/frexp.ll
@@ -0,0 +1,246 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL-SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+
+define void @frexp_f16_i32() {
+; GFX7-LABEL: 'frexp_f16_i32'
+; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'frexp_f16_i32'
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'frexp_f16_i32'
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i32'
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call { half, i32 } @llvm.frexp.f16.i32(half undef)
+ %v2f16 = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> undef)
+ %v3f16 = call { <3 x half>, <3 x i32> } @llvm.frexp.v3f16.v3i32(<3 x half> undef)
+ %v4f16 = call { <4 x half>, <4 x i32> } @llvm.frexp.v4f16.v4i32(<4 x half> undef)
+ %v5f16 = call { <5 x half>, <5 x i32> } @llvm.frexp.v5f16.v5i32(<5 x half> undef)
+ %v8f16 = call { <8 x half>, <8 x i32> } @llvm.frexp.v8f16.v8i32(<8 x half> undef)
+ %v16f16 = call { <16 x half>, <16 x i32> } @llvm.frexp.v16f16.v16i32(<16 x half> undef)
+ %v17f16 = call { <17 x half>, <17 x i32> } @llvm.frexp.v17f16.v17i32(<17 x half> undef)
+ ret void
+}
+
+define void @frexp_f16_i16() {
+; GFX7-LABEL: 'frexp_f16_i16'
+; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'frexp_f16_i16'
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'frexp_f16_i16'
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'frexp_f16_i16'
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call { half, i16 } @llvm.frexp.f16.i16(half undef)
+ %v2f16 = call { <2 x half>, <2 x i16> } @llvm.frexp.v2f16.v2i16(<2 x half> undef)
+ %v3f16 = call { <3 x half>, <3 x i16> } @llvm.frexp.v3f16.v3i16(<3 x half> undef)
+ %v4f16 = call { <4 x half>, <4 x i16> } @llvm.frexp.v4f16.v4i16(<4 x half> undef)
+ %v5f16 = call { <5 x half>, <5 x i16> } @llvm.frexp.v5f16.v5i16(<5 x half> undef)
+ %v8f16 = call { <8 x half>, <8 x i16> } @llvm.frexp.v8f16.v8i16(<8 x half> undef)
+ %v16f16 = call { <16 x half>, <16 x i16> } @llvm.frexp.v16f16.v16i16(<16 x half> undef)
+ %v17f16 = call { <17 x half>, <17 x i16> } @llvm.frexp.v17f16.v17i16(<17 x half> undef)
+ ret void
+}
+
+define void @frexp_bf16() {
+; GFX7-LABEL: 'frexp_bf16'
+; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'frexp_bf16'
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'frexp_bf16'
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'frexp_bf16'
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call { bfloat, i32 } @llvm.frexp.bf16.i32(bfloat undef)
+ %v2bf16 = call { <2 x bfloat>, <2 x i32> } @llvm.frexp.v2bf16.v2i32(<2 x bfloat> undef)
+ %v3bf16 = call { <3 x bfloat>, <3 x i32> } @llvm.frexp.v3bf16.v3i32(<3 x bfloat> undef)
+ %v4bf16 = call { <4 x bfloat>, <4 x i32> } @llvm.frexp.v4bf16.v4i32(<4 x bfloat> undef)
+ %v5bf16 = call { <5 x bfloat>, <5 x i32> } @llvm.frexp.v5bf16.v5i32(<5 x bfloat> undef)
+ %v8bf16 = call { <8 x bfloat>, <8 x i32> } @llvm.frexp.v8bf16.v8i32(<8 x bfloat> undef)
+ %v16bf16 = call { <16 x bfloat>, <16 x i32> } @llvm.frexp.v16bf16.v16i32(<16 x bfloat> undef)
+ %v17bf16 = call { <17 x bfloat>, <17 x i32> } @llvm.frexp.v17bf16.v17i32(<17 x bfloat> undef)
+ ret void
+}
+
+define void @frexp_f32() {
+; ALL-LABEL: 'frexp_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'frexp_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+ %v2f32 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef)
+ %v3f32 = call { <3 x float>, <3 x i32> } @llvm.frexp.v3f32.v3i32(<3 x float> undef)
+ %v4f32 = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> undef)
+ %v5f32 = call { <5 x float>, <5 x i32> } @llvm.frexp.v5f32.v5i32(<5 x float> undef)
+ %v8f32 = call { <8 x float>, <8 x i32> } @llvm.frexp.v8f32.v8i32(<8 x float> undef)
+ %v16f32 = call { <16 x float>, <16 x i32> } @llvm.frexp.v16f32.v16i32(<16 x float> undef)
+ %v17f32 = call { <17 x float>, <17 x i32> } @llvm.frexp.v17f32.v17i32(<17 x float> undef)
+ ret void
+}
+
+define void @frexp_f64() {
+; ALL-LABEL: 'frexp_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'frexp_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call { double, i32 } @llvm.frexp.f64.i32(double undef)
+ %v2f64 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> undef)
+ %v3f64 = call { <3 x double>, <3 x i32> } @llvm.frexp.v3f64.v3i32(<3 x double> undef)
+ %v4f64 = call { <4 x double>, <4 x i32> } @llvm.frexp.v4f64.v4i32(<4 x double> undef)
+ %v5f64 = call { <5 x double>, <5 x i32> } @llvm.frexp.v5f64.v5i32(<5 x double> undef)
+ %v8f64 = call { <8 x double>, <8 x i32> } @llvm.frexp.v8f64.v8i32(<8 x double> undef)
+ %v16f64 = call { <16 x double>, <16 x i32> } @llvm.frexp.v16f64.v16i32(<16 x double> undef)
+ %v17f64 = call { <17 x double>, <17 x i32> } @llvm.frexp.v17f64.v17i32(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll b/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll
new file mode 100644
index 0000000..fc7af38
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/is_fpclass.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
+
+define void @is_fpclass_f16() {
+; ALL-LABEL: 'is_fpclass_f16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_f16'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call i1 @llvm.is.fpclass.f16(half undef, i32 0)
+ %v2f16 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> undef, i32 0)
+ %v3f16 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> undef, i32 0)
+ %v4f16 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> undef, i32 0)
+ %v5f16 = call <5 x i1> @llvm.is.fpclass.v5f16(<5 x half> undef, i32 0)
+ %v8f16 = call <8 x i1> @llvm.is.fpclass.v8f16(<8 x half> undef, i32 0)
+ %v16f16 = call <16 x i1> @llvm.is.fpclass.v16f16(<16 x half> undef, i32 0)
+ %v17f16 = call <17 x i1> @llvm.is.fpclass.v17f16(<17 x half> undef, i32 0)
+ ret void
+}
+
+define void @is_fpclass_bf16() {
+; ALL-LABEL: 'is_fpclass_bf16'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_bf16'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call i1 @llvm.is.fpclass.bf16(bfloat undef, i32 0)
+ %v2bf16 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> undef, i32 0)
+ %v3bf16 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> undef, i32 0)
+ %v4bf16 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> undef, i32 0)
+ %v5bf16 = call <5 x i1> @llvm.is.fpclass.v5bf16(<5 x bfloat> undef, i32 0)
+ %v8bf16 = call <8 x i1> @llvm.is.fpclass.v8bf16(<8 x bfloat> undef, i32 0)
+ %v16bf16 = call <16 x i1> @llvm.is.fpclass.v16bf16(<16 x bfloat> undef, i32 0)
+ %v17bf16 = call <17 x i1> @llvm.is.fpclass.v17bf16(<17 x bfloat> undef, i32 0)
+ ret void
+}
+
+define void @is_fpclass_f32() {
+; ALL-LABEL: 'is_fpclass_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call i1 @llvm.is.fpclass.f32(float undef, i32 0)
+ %v2f32 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> undef, i32 0)
+ %v3f32 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> undef, i32 0)
+ %v4f32 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> undef, i32 0)
+ %v5f32 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> undef, i32 0)
+ %v8f32 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> undef, i32 0)
+ %v16f32 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> undef, i32 0)
+ %v17f32 = call <17 x i1> @llvm.is.fpclass.v17f32(<17 x float> undef, i32 0)
+ ret void
+}
+
+define void @is_fpclass_f64() {
+; ALL-LABEL: 'is_fpclass_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'is_fpclass_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call i1 @llvm.is.fpclass.f64(double undef, i32 0)
+ %v2f64 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> undef, i32 0)
+ %v3f64 = call <3 x i1> @llvm.is.fpclass.v3f64(<3 x double> undef, i32 0)
+ %v4f64 = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> undef, i32 0)
+ %v5f64 = call <5 x i1> @llvm.is.fpclass.v5f64(<5 x double> undef, i32 0)
+ %v8f64 = call <8 x i1> @llvm.is.fpclass.v8f64(<8 x double> undef, i32 0)
+ %v16f64 = call <16 x i1> @llvm.is.fpclass.v16f64(<16 x double> undef, i32 0)
+ %v17f64 = call <17 x i1> @llvm.is.fpclass.v17f64(<17 x double> undef, i32 0)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll b/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll
new file mode 100644
index 0000000..2b1b590
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/ldexp.ll
@@ -0,0 +1,246 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL,GFX8PLUS %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=ALL-SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=ALL-SIZE,GFX8PLUS-SIZE %s
+
+define void @ldexp_f16_i32() {
+; GFX7-LABEL: 'ldexp_f16_i32'
+; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'ldexp_f16_i32'
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'ldexp_f16_i32'
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'ldexp_f16_i32'
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.ldexp.f16.i32(half undef, i32 undef)
+ %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> undef, <2 x i32> undef)
+ %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> undef, <3 x i32> undef)
+ %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> undef, <4 x i32> undef)
+ %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i32(<5 x half> undef, <5 x i32> undef)
+ %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i32(<8 x half> undef, <8 x i32> undef)
+ %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i32(<16 x half> undef, <16 x i32> undef)
+ %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i32(<17 x half> undef, <17 x i32> undef)
+ ret void
+}
+
+define void @ldexp_f16_i16() {
+; GFX7-LABEL: 'ldexp_f16_i16'
+; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'ldexp_f16_i16'
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'ldexp_f16_i16'
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'ldexp_f16_i16'
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.ldexp.f16.i16(half undef, i16 undef)
+ %v2f16 = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> undef, <2 x i16> undef)
+ %v3f16 = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> undef, <3 x i16> undef)
+ %v4f16 = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> undef, <4 x i16> undef)
+ %v5f16 = call <5 x half> @llvm.ldexp.v5f16.v5i16(<5 x half> undef, <5 x i16> undef)
+ %v8f16 = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> undef, <8 x i16> undef)
+ %v16f16 = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> undef, <16 x i16> undef)
+ %v17f16 = call <17 x half> @llvm.ldexp.v17f16.v17i16(<17 x half> undef, <17 x i16> undef)
+ ret void
+}
+
+define void @ldexp_bf16() {
+; GFX7-LABEL: 'ldexp_bf16'
+; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8PLUS-LABEL: 'ldexp_bf16'
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX8PLUS-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'ldexp_bf16'
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8PLUS-SIZE-LABEL: 'ldexp_bf16'
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+; GFX8PLUS-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.ldexp.bf16.i32(bfloat undef, i32 undef)
+ %v2bf16 = call <2 x bfloat> @llvm.ldexp.v2bf16.v2i32(<2 x bfloat> undef, <2 x i32> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.ldexp.v3bf16.v3i32(<3 x bfloat> undef, <3 x i32> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.ldexp.v4bf16.v4i32(<4 x bfloat> undef, <4 x i32> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.ldexp.v5bf16.v5i32(<5 x bfloat> undef, <5 x i32> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.ldexp.v8bf16.v8i32(<8 x bfloat> undef, <8 x i32> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.ldexp.v16bf16.v16i32(<16 x bfloat> undef, <16 x i32> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.ldexp.v17bf16.v17i32(<17 x bfloat> undef, <17 x i32> undef)
+ ret void
+}
+
+define void @ldexp_f32() {
+; ALL-LABEL: 'ldexp_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'ldexp_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.ldexp.f32.i32(float undef, i32 undef)
+ %v2f32 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> undef, <2 x i32> undef)
+ %v3f32 = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> undef, <3 x i32> undef)
+ %v4f32 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> undef, <4 x i32> undef)
+ %v5f32 = call <5 x float> @llvm.ldexp.v5f32.v5i32(<5 x float> undef, <5 x i32> undef)
+ %v8f32 = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> undef, <8 x i32> undef)
+ %v16f32 = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> undef, <16 x i32> undef)
+ %v17f32 = call <17 x float> @llvm.ldexp.v17f32.v17i32(<17 x float> undef, <17 x i32> undef)
+ ret void
+}
+
+define void @ldexp_f64() {
+; ALL-LABEL: 'ldexp_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'ldexp_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.ldexp.f64.i32(double undef, i32 undef)
+ %v2f64 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> undef, <2 x i32> undef)
+ %v3f64 = call <3 x double> @llvm.ldexp.v3f64.v3i32(<3 x double> undef, <3 x i32> undef)
+ %v4f64 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> undef, <4 x i32> undef)
+ %v5f64 = call <5 x double> @llvm.ldexp.v5f64.v5i32(<5 x double> undef, <5 x i32> undef)
+ %v8f64 = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> undef, <8 x i32> undef)
+ %v16f64 = call <16 x double> @llvm.ldexp.v16f64.v16i32(<16 x double> undef, <16 x i32> undef)
+ %v17f64 = call <17 x double> @llvm.ldexp.v17f64.v17i32(<17 x double> undef, <17 x i32> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log.ll b/llvm/test/Analysis/CostModel/AMDGPU/log.ll
new file mode 100644
index 0000000..2cf7039
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/log.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @log_f16() {
+; BASE-LABEL: 'log_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.log.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.log.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.log.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.log.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @log_bf16() {
+; BASE-LABEL: 'log_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.log.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.log.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.log.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.log.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @log_f32() {
+; ALL-LABEL: 'log_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.log.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.log.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.log.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.log.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @log_f64() {
+; ALL-LABEL: 'log_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.log.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.log.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.log.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.log.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log10.ll b/llvm/test/Analysis/CostModel/AMDGPU/log10.ll
new file mode 100644
index 0000000..d807c6cd63
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/log10.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @log10_f16() {
+; BASE-LABEL: 'log10_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log10_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log10_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log10_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log10_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log10_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log10_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log10_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log10.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.log10.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.log10.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.log10.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.log10.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @log10_bf16() {
+; BASE-LABEL: 'log10_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log10_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log10_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log10_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log10_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log10_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log10_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log10_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.log10.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.log10.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.log10.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.log10.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @log10_f32() {
+; ALL-LABEL: 'log10_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log10.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log10_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log10.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.log10.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.log10.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.log10.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.log10.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @log10_f64() {
+; ALL-LABEL: 'log10_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log10.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log10_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log10.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.log10.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.log10.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.log10.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.log10.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/log2.ll b/llvm/test/Analysis/CostModel/AMDGPU/log2.ll
new file mode 100644
index 0000000..1ef3977
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/log2.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @log2_f16() {
+; BASE-LABEL: 'log2_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log2_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log2_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log2_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log2_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log2_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log2_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log2_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.log2.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.log2.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.log2.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.log2.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.log2.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @log2_bf16() {
+; BASE-LABEL: 'log2_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'log2_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'log2_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'log2_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'log2_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'log2_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'log2_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'log2_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.log2.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.log2.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.log2.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.log2.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @log2_f32() {
+; ALL-LABEL: 'log2_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log2.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log2_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.log2.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.log2.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.log2.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.log2.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.log2.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @log2_f64() {
+; ALL-LABEL: 'log2_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.log2.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'log2_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.log2.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.log2.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.log2.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.log2.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.log2.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
new file mode 100644
index 0000000..0f6f01b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maximum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @maximum_f16() {
+; GFX9-LABEL: 'maximum_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maximum_f16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximum_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maximum_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maximum_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximum.f16(half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %f16 = call half @llvm.maximum.f16(half undef, half undef)
+ %v2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef)
+  %v3f16 = call <3 x half> @llvm.maximum.v3f16(<3 x half> undef, <3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef)
+ %v8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef)
+ ret void
+}
+
+define void @maximum_bf16() {
+; GFX9-LABEL: 'maximum_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maximum_bf16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximum_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maximum_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maximum_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %bf16 = call bfloat @llvm.maximum.bf16(bfloat undef, bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+  %v3bf16 = call <3 x bfloat> @llvm.maximum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+ ret void
+}
+
+define void @maximum_f32() {
+; ALL-LABEL: 'maximum_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximum_f32'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximum.f32(float undef, float undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.maximum.f32(float undef, float undef)
+ %v2f32 = call <2 x float> @llvm.maximum.v2f32(<2 x float> undef, <2 x float> undef)
+  %v3f32 = call <3 x float> @llvm.maximum.v3f32(<3 x float> undef, <3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.maximum.v4f32(<4 x float> undef, <4 x float> undef)
+ %v8f32 = call <8 x float> @llvm.maximum.v8f32(<8 x float> undef, <8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.maximum.v16f32(<16 x float> undef, <16 x float> undef)
+ ret void
+}
+
+define void @maximum_f64() {
+; ALL-LABEL: 'maximum_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.maximum.f64(double undef, double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximum_f64'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.maximum.f64(double undef, double undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.maximum.f64(double undef, double undef)
+ %v2f64 = call <2 x double> @llvm.maximum.v2f64(<2 x double> undef, <2 x double> undef)
+  %v3f64 = call <3 x double> @llvm.maximum.v3f64(<3 x double> undef, <3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.maximum.v4f64(<4 x double> undef, <4 x double> undef)
+ %v8f64 = call <8 x double> @llvm.maximum.v8f64(<8 x double> undef, <8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.maximum.v16f64(<16 x double> undef, <16 x double> undef)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll
new file mode 100644
index 0000000..0d423fe
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maxnum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @maxnum_f16() {
+; GFX9-LABEL: 'maxnum_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maxnum_f16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maxnum_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maxnum_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maxnum_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %f16 = call half @llvm.maxnum.f16(half undef, half undef)
+ %v2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.maxnum.v3f16(<3 x half> undef, <3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+ %v8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+ ret void
+}
+
+define void @maxnum_bf16() {
+; GFX9-LABEL: 'maxnum_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'maxnum_bf16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maxnum_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'maxnum_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'maxnum_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %bf16 = call bfloat @llvm.maxnum.bf16(bfloat undef, bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+ ret void
+}
+
+define void @maxnum_f32() {
+; ALL-LABEL: 'maxnum_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maxnum.f32(float undef, float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maxnum_f32'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maxnum.f32(float undef, float undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.maxnum.f32(float undef, float undef)
+ %v2f32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.maxnum.v3f32(<3 x float> undef, <3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+ %v8f32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+ ret void
+}
+
+define void @maxnum_f64() {
+; ALL-LABEL: 'maxnum_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maxnum_f64'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maxnum.f64(double undef, double undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.maxnum.f64(double undef, double undef)
+ %v2f64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.maxnum.v3f64(<3 x double> undef, <3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+ %v8f64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.maxnum.v16f64(<16 x double> undef, <16 x double> undef)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
new file mode 100644
index 0000000..b6e52cf
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minimum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @minimum_f16() {
+; GFX9-LABEL: 'minimum_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 175 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minimum_f16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimum_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minimum_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minimum_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %f16 = call half @llvm.minimum.f16(half undef, half undef)
+ %v2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.minimum.v3f16(<3 x half> undef, <3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef)
+ %v8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef)
+ ret void
+}
+
+define void @minimum_bf16() {
+; GFX9-LABEL: 'minimum_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minimum_bf16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimum_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minimum_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minimum_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %bf16 = call bfloat @llvm.minimum.bf16(bfloat undef, bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.minimum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.minimum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+ ret void
+}
+
+define void @minimum_f32() {
+; ALL-LABEL: 'minimum_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call float @llvm.minimum.f32(float undef, float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimum_f32'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimum.f32(float undef, float undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.minimum.f32(float undef, float undef)
+ %v2f32 = call <2 x float> @llvm.minimum.v2f32(<2 x float> undef, <2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.minimum.v3f32(<3 x float> undef, <3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.minimum.v4f32(<4 x float> undef, <4 x float> undef)
+ %v8f32 = call <8 x float> @llvm.minimum.v8f32(<8 x float> undef, <8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.minimum.v16f32(<16 x float> undef, <16 x float> undef)
+ ret void
+}
+
+define void @minimum_f64() {
+; ALL-LABEL: 'minimum_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call double @llvm.minimum.f64(double undef, double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimum_f64'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.minimum.f64(double undef, double undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.minimum.f64(double undef, double undef)
+ %v2f64 = call <2 x double> @llvm.minimum.v2f64(<2 x double> undef, <2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.minimum.v3f64(<3 x double> undef, <3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.minimum.v4f64(<4 x double> undef, <4 x double> undef)
+ %v8f64 = call <8 x double> @llvm.minimum.v8f64(<8 x double> undef, <8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.minimum.v16f64(<16 x double> undef, <16 x double> undef)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll
new file mode 100644
index 0000000..61432ba
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minnum.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,GFX90A-FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,GFX9,FASTF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE,GFX90A-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+
+define void @minnum_f16() {
+; GFX9-LABEL: 'minnum_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minnum_f16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minnum_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minnum_f16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minnum_f16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %f16 = call half @llvm.minnum.f16(half undef, half undef)
+ %v2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.minnum.v3f16(<3 x half> undef, <3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+ %v8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+ ret void
+}
+
+define void @minnum_bf16() {
+; GFX9-LABEL: 'minnum_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SLOWF64-LABEL: 'minnum_bf16'
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minnum_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-SIZE-LABEL: 'minnum_bf16'
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SLOW-LABEL: 'minnum_bf16'
+; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+ %bf16 = call bfloat @llvm.minnum.bf16(bfloat undef, bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef)
+ ret void
+}
+
+define void @minnum_f32() {
+; ALL-LABEL: 'minnum_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minnum.f32(float undef, float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minnum_f32'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minnum.f32(float undef, float undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.minnum.f32(float undef, float undef)
+ %v2f32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.minnum.v3f32(<3 x float> undef, <3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+ %v8f32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+ ret void
+}
+
+define void @minnum_f64() {
+; ALL-LABEL: 'minnum_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minnum_f64'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minnum.f64(double undef, double undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.minnum.f64(double undef, double undef)
+ %v2f64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.minnum.v3f64(<3 x double> undef, <3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+ %v8f64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.minnum.v16f64(<16 x double> undef, <16 x double> undef)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTF64: {{.*}}
+; GFX90A-FASTF64: {{.*}}
+; GFX90A-SIZE: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll b/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll
new file mode 100644
index 0000000..8600dd7
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/ptrmask.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL-SIZE %s
+; END.
+
+define ptr @ptrmask_p0_i64(ptr %ptr, i64 %mask) {
+; ALL-LABEL: 'ptrmask_p0_i64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p0_i64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr %result
+;
+ %result = call ptr @llvm.ptrmask.p0.i64(ptr %ptr, i64 %mask)
+ ret ptr %result
+}
+
+define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> %ptr, <2 x i64> %mask) {
+; ALL-LABEL: 'ptrmask_v2p0_v2i64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <2 x ptr> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v2p0_v2i64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x ptr> %result
+;
+ %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> %mask)
+ ret <2 x ptr> %result
+}
+
+define ptr addrspace(1) @ptrmask_p1_i64(ptr addrspace(1) %ptr, i64 %mask) {
+; ALL-LABEL: 'ptrmask_p1_i64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(1) %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p1_i64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(1) %result
+;
+ %result = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
+ ret ptr addrspace(1) %result
+}
+
+define ptr addrspace(5) @ptrmask_p5_i32(ptr addrspace(5) %ptr, i32 %mask) {
+; ALL-LABEL: 'ptrmask_p5_i32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(5) %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p5_i32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(5) %result
+;
+ %result = call ptr addrspace(5) @llvm.ptrmask.p5.i32(ptr addrspace(5) %ptr, i32 %mask)
+ ret ptr addrspace(5) %result
+}
+
+define ptr addrspace(3) @ptrmask_p3_i32(ptr addrspace(3) %ptr, i32 %mask) {
+; ALL-LABEL: 'ptrmask_p3_i32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(3) %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_p3_i32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(3) %result
+;
+ %result = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
+ ret ptr addrspace(3) %result
+}
+
+define <2 x ptr addrspace(5)> @ptrmask_v2p5_v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask) {
+; ALL-LABEL: 'ptrmask_v2p5_v2i32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <2 x ptr addrspace(5)> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v2p5_v2i32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x ptr addrspace(5)> %result
+;
+ %result = call <2 x ptr addrspace(5)> @llvm.ptrmask.v2p5.v2i32(<2 x ptr addrspace(5)> %ptr, <2 x i32> %mask)
+ ret <2 x ptr addrspace(5)> %result
+}
+
+define <3 x ptr> @ptrmask_v3p0_v3i64(<3 x ptr> %ptr, <3 x i64> %mask) {
+; ALL-LABEL: 'ptrmask_v3p0_v3i64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <3 x ptr> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v3p0_v3i64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <3 x ptr> %result
+;
+ %result = call <3 x ptr> @llvm.ptrmask.v3p0.v3i64(<3 x ptr> %ptr, <3 x i64> %mask)
+ ret <3 x ptr> %result
+}
+
+define <3 x ptr addrspace(5)> @ptrmask_v3p5_v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask) {
+; ALL-LABEL: 'ptrmask_v3p5_v3i32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret <3 x ptr addrspace(5)> %result
+;
+; ALL-SIZE-LABEL: 'ptrmask_v3p5_v3i32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <3 x ptr addrspace(5)> %result
+;
+ %result = call <3 x ptr addrspace(5)> @llvm.ptrmask.v3p5.v3i32(<3 x ptr addrspace(5)> %ptr, <3 x i32> %mask)
+ ret <3 x ptr addrspace(5)> %result
+}
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll b/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll
new file mode 100644
index 0000000..7136b70
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/sqrt.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,BASE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+
+define void @sqrt_f16() {
+; BASE-LABEL: 'sqrt_f16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'sqrt_f16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'sqrt_f16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'sqrt_f16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'sqrt_f16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'sqrt_f16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'sqrt_f16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'sqrt_f16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.sqrt.f16(half undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f16 = call half @llvm.sqrt.f16(half undef)
+ %v2f16 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
+ %v3f16 = call <3 x half> @llvm.sqrt.v3f16(<3 x half> undef)
+ %v4f16 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
+ %v5f16 = call <5 x half> @llvm.sqrt.v5f16(<5 x half> undef)
+ %v8f16 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
+ %v16f16 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
+ %v17f16 = call <17 x half> @llvm.sqrt.v17f16(<17 x half> undef)
+ ret void
+}
+
+define void @sqrt_bf16() {
+; BASE-LABEL: 'sqrt_bf16'
+; BASE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; BASE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'sqrt_bf16'
+; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'sqrt_bf16'
+; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX10-LABEL: 'sqrt_bf16'
+; GFX10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; BASE-SIZE-LABEL: 'sqrt_bf16'
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'sqrt_bf16'
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'sqrt_bf16'
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX10-SIZE-LABEL: 'sqrt_bf16'
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %bf16 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+ %v2bf16 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+ %v3bf16 = call <3 x bfloat> @llvm.sqrt.v3bf16(<3 x bfloat> undef)
+ %v4bf16 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+ %v5bf16 = call <5 x bfloat> @llvm.sqrt.v5bf16(<5 x bfloat> undef)
+ %v8bf16 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+ %v16bf16 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+ %v17bf16 = call <17 x bfloat> @llvm.sqrt.v17bf16(<17 x bfloat> undef)
+ ret void
+}
+
+define void @sqrt_f32() {
+; ALL-LABEL: 'sqrt_f32'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.sqrt.f32(float undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'sqrt_f32'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.sqrt.f32(float undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f32 = call float @llvm.sqrt.f32(float undef)
+ %v2f32 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
+ %v3f32 = call <3 x float> @llvm.sqrt.v3f32(<3 x float> undef)
+ %v4f32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
+ %v5f32 = call <5 x float> @llvm.sqrt.v5f32(<5 x float> undef)
+ %v8f32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
+ %v16f32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
+ %v17f32 = call <17 x float> @llvm.sqrt.v17f32(<17 x float> undef)
+ ret void
+}
+
+define void @sqrt_f64() {
+; ALL-LABEL: 'sqrt_f64'
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.sqrt.f64(double undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef)
+; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; ALL-SIZE-LABEL: 'sqrt_f64'
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.sqrt.f64(double undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef)
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %f64 = call double @llvm.sqrt.f64(double undef)
+ %v2f64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+ %v3f64 = call <3 x double> @llvm.sqrt.v3f64(<3 x double> undef)
+ %v4f64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+ %v5f64 = call <5 x double> @llvm.sqrt.v5f64(<5 x double> undef)
+ %v8f64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+ %v16f64 = call <16 x double> @llvm.sqrt.v16f64(<16 x double> undef)
+ %v17f64 = call <17 x double> @llvm.sqrt.v17f64(<17 x double> undef)
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index 6ddd57a..669e702 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -4314,3 +4314,66 @@ define void @uitofp() {
ret void
}
+
+define void @oddvec_sizes() {
+; CHECK-LABEL: 'oddvec_sizes'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %1 = sext <3 x i8> undef to <3 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %2 = sext <7 x i8> undef to <7 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %3 = sext <15 x i8> undef to <15 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %4 = zext <3 x i8> undef to <3 x i16>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %5 = zext <7 x i8> undef to <7 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %6 = zext <15 x i8> undef to <15 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %7 = trunc <3 x i32> undef to <3 x i8>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %8 = trunc <7 x i32> undef to <7 x i8>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %9 = trunc <15 x i32> undef to <15 x i8>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %10 = bitcast <3 x i32> undef to <3 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %11 = bitcast <7 x i32> undef to <7 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %12 = bitcast <15 x i32> undef to <15 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %13 = sitofp <3 x i32> undef to <3 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %14 = sitofp <7 x i32> undef to <7 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %15 = sitofp <15 x i32> undef to <15 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %16 = uitofp <3 x i32> undef to <3 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %17 = uitofp <7 x i32> undef to <7 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %18 = uitofp <15 x i32> undef to <15 x float>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = fptosi <3 x float> undef to <3 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = fptosi <7 x float> undef to <7 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = fptosi <15 x float> undef to <15 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = fptoui <3 x float> undef to <3 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = fptoui <7 x float> undef to <7 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = fptoui <15 x float> undef to <15 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ sext <3 x i8> undef to <3 x i16>
+ sext <7 x i8> undef to <7 x i32>
+ sext <15 x i8> undef to <15 x i32>
+
+ zext <3 x i8> undef to <3 x i16>
+ zext <7 x i8> undef to <7 x i32>
+ zext <15 x i8> undef to <15 x i32>
+
+ trunc <3 x i32> undef to <3 x i8>
+ trunc <7 x i32> undef to <7 x i8>
+ trunc <15 x i32> undef to <15 x i8>
+
+ bitcast <3 x i32> undef to <3 x float>
+ bitcast <7 x i32> undef to <7 x float>
+ bitcast <15 x i32> undef to <15 x float>
+
+ sitofp <3 x i32> undef to <3 x float>
+ sitofp <7 x i32> undef to <7 x float>
+ sitofp <15 x i32> undef to <15 x float>
+
+ uitofp <3 x i32> undef to <3 x float>
+ uitofp <7 x i32> undef to <7 x float>
+ uitofp <15 x i32> undef to <15 x float>
+
+ fptosi <3 x float> undef to <3 x i32>
+ fptosi <7 x float> undef to <7 x i32>
+ fptosi <15 x float> undef to <15 x i32>
+
+ fptoui <3 x float> undef to <3 x i32>
+ fptoui <7 x float> undef to <7 x i32>
+ fptoui <15 x float> undef to <15 x i32>
+
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
index 4e92937..2448056 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
@@ -6,141 +6,181 @@
define void @load(ptr %p) {
; CHECK-LABEL: 'load'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr %p, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i8>, ptr %p, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i8>, ptr %p, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i8>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i8>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = load <32 x i8>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i8>, ptr %p, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i8>, ptr %p, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i8>, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i8>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = load <vscale x 16 x i8>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <vscale x 32 x i8>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load i16, ptr %p, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i16>, ptr %p, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i16>, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i16>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i16>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = load <16 x i16>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = load <32 x i16>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i16>, ptr %p, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i16>, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i16>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = load <vscale x 8 x i16>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = load <vscale x 16 x i16>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %26 = load <vscale x 32 x i16>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load i32, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i32>, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i32>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i32>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = load <8 x i32>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %32 = load <16 x i32>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %33 = load <32 x i32>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i32>, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i32>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %36 = load <vscale x 4 x i32>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %37 = load <vscale x 8 x i32>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %38 = load <vscale x 16 x i32>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %39 = load <vscale x 32 x i32>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %43 = load <4 x i64>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %44 = load <8 x i64>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %45 = load <16 x i64>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %46 = load <32 x i64>, ptr %p, align 256
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i64>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %48 = load <vscale x 2 x i64>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %49 = load <vscale x 4 x i64>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %50 = load <vscale x 8 x i64>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %51 = load <vscale x 16 x i64>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %52 = load <vscale x 32 x i64>, ptr %p, align 256
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %53 = load ptr, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x ptr>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x ptr>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %56 = load <4 x ptr>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = load <8 x ptr>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = load <16 x ptr>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = load <32 x ptr>, ptr %p, align 256
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x ptr>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %61 = load <vscale x 2 x ptr>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %62 = load <vscale x 4 x ptr>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %63 = load <vscale x 8 x ptr>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %64 = load <vscale x 16 x ptr>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %65 = load <vscale x 32 x ptr>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i1, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i1>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <32 x i1>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = load <vscale x 16 x i1>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = load <vscale x 32 x i1>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load i8, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i8>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i8>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i8>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <16 x i8>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = load <32 x i8>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i8>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i8>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i8>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <vscale x 8 x i8>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = load <vscale x 16 x i8>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = load <vscale x 32 x i8>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load i16, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i16>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i16>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i16>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = load <8 x i16>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %32 = load <16 x i16>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %33 = load <32 x i16>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i16>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i16>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = load <vscale x 4 x i16>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %37 = load <vscale x 8 x i16>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %38 = load <vscale x 16 x i16>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %39 = load <vscale x 32 x i16>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i32, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i32>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i32>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = load <4 x i32>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %44 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %45 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %46 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i32>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %48 = load <vscale x 2 x i32>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %49 = load <vscale x 4 x i32>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %50 = load <vscale x 8 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %51 = load <vscale x 16 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %52 = load <vscale x 32 x i32>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %53 = load i64, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x i64>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x i64>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %56 = load <4 x i64>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = load <8 x i64>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = load <16 x i64>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = load <32 x i64>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x i64>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %61 = load <vscale x 2 x i64>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %62 = load <vscale x 4 x i64>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %63 = load <vscale x 8 x i64>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %64 = load <vscale x 16 x i64>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %65 = load <vscale x 32 x i64>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %66 = load ptr, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %67 = load <1 x ptr>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %68 = load <2 x ptr>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = load <4 x ptr>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = load <8 x ptr>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = load <16 x ptr>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = load <32 x ptr>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %73 = load <vscale x 1 x ptr>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %74 = load <vscale x 2 x ptr>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %75 = load <vscale x 4 x ptr>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %76 = load <vscale x 8 x ptr>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %77 = load <vscale x 16 x ptr>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %78 = load <vscale x 32 x ptr>, ptr %p, align 256
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CODESIZE-LABEL: 'load'
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i8, ptr %p, align 1
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i8>, ptr %p, align 1
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i8>, ptr %p, align 2
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i8>, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i8>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i8>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <32 x i8>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i8>, ptr %p, align 1
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i8>, ptr %p, align 2
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i8>, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i8>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <vscale x 16 x i8>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <vscale x 32 x i8>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load i16, ptr %p, align 2
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i16>, ptr %p, align 2
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i16>, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i16>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i16>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <16 x i16>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <32 x i16>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i16>, ptr %p, align 2
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i16>, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i16>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <vscale x 8 x i16>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = load <vscale x 16 x i16>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = load <vscale x 32 x i16>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load i32, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i32>, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i32>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i32>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = load <8 x i32>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = load <16 x i32>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = load <32 x i32>, ptr %p, align 128
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i32>, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i32>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = load <vscale x 4 x i32>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = load <vscale x 8 x i32>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = load <vscale x 16 x i32>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %39 = load <vscale x 32 x i32>, ptr %p, align 128
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = load <4 x i64>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %44 = load <8 x i64>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = load <16 x i64>, ptr %p, align 128
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %46 = load <32 x i64>, ptr %p, align 256
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i64>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %48 = load <vscale x 2 x i64>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = load <vscale x 4 x i64>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %50 = load <vscale x 8 x i64>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %51 = load <vscale x 16 x i64>, ptr %p, align 128
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = load <vscale x 32 x i64>, ptr %p, align 256
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %53 = load ptr, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x ptr>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x ptr>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %56 = load <4 x ptr>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %57 = load <8 x ptr>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %58 = load <16 x ptr>, ptr %p, align 128
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %59 = load <32 x ptr>, ptr %p, align 256
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x ptr>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = load <vscale x 2 x ptr>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = load <vscale x 4 x ptr>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %63 = load <vscale x 8 x ptr>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %64 = load <vscale x 16 x ptr>, ptr %p, align 128
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %65 = load <vscale x 32 x ptr>, ptr %p, align 256
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load i1, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <1 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <2 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load <8 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <16 x i1>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <32 x i1>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <vscale x 1 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <vscale x 2 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <vscale x 4 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <vscale x 8 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <vscale x 16 x i1>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <vscale x 32 x i1>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load i8, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <1 x i8>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <2 x i8>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i8>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <8 x i8>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <16 x i8>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <32 x i8>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <vscale x 1 x i8>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <vscale x 2 x i8>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <vscale x 4 x i8>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <vscale x 8 x i8>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = load <vscale x 16 x i8>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = load <vscale x 32 x i8>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = load i16, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = load <1 x i16>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = load <2 x i16>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = load <4 x i16>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = load <8 x i16>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = load <16 x i16>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = load <32 x i16>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = load <vscale x 1 x i16>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = load <vscale x 2 x i16>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = load <vscale x 4 x i16>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = load <vscale x 8 x i16>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = load <vscale x 16 x i16>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %39 = load <vscale x 32 x i16>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %40 = load i32, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i32>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i32>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = load <4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %44 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %47 = load <vscale x 1 x i32>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %48 = load <vscale x 2 x i32>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = load <vscale x 4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %50 = load <vscale x 8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %51 = load <vscale x 16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %52 = load <vscale x 32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %53 = load i64, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %54 = load <1 x i64>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %55 = load <2 x i64>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %56 = load <4 x i64>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %57 = load <8 x i64>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %58 = load <16 x i64>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %59 = load <32 x i64>, ptr %p, align 256
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %60 = load <vscale x 1 x i64>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = load <vscale x 2 x i64>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = load <vscale x 4 x i64>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %63 = load <vscale x 8 x i64>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %64 = load <vscale x 16 x i64>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %65 = load <vscale x 32 x i64>, ptr %p, align 256
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %66 = load ptr, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %67 = load <1 x ptr>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %68 = load <2 x ptr>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %69 = load <4 x ptr>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %70 = load <8 x ptr>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %71 = load <16 x ptr>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %72 = load <32 x ptr>, ptr %p, align 256
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %73 = load <vscale x 1 x ptr>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %74 = load <vscale x 2 x ptr>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %75 = load <vscale x 4 x ptr>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %76 = load <vscale x 8 x ptr>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %77 = load <vscale x 16 x ptr>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %78 = load <vscale x 32 x ptr>, ptr %p, align 256
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+ load i1, ptr %p
+ load <1 x i1>, ptr %p
+ load <2 x i1>, ptr %p
+ load <4 x i1>, ptr %p
+ load <8 x i1>, ptr %p
+ load <16 x i1>, ptr %p
+ load <32 x i1>, ptr %p
+ load <vscale x 1 x i1>, ptr %p
+ load <vscale x 2 x i1>, ptr %p
+ load <vscale x 4 x i1>, ptr %p
+ load <vscale x 8 x i1>, ptr %p
+ load <vscale x 16 x i1>, ptr %p
+ load <vscale x 32 x i1>, ptr %p
+
load i8, ptr %p
load <1 x i8>, ptr %p
load <2 x i8>, ptr %p
@@ -217,6 +257,19 @@ define void @load(ptr %p) {
define void @store(ptr %p) {
; CHECK-LABEL: 'store'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i1 undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 16 x i1> undef, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 32 x i1> undef, ptr %p, align 4
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr %p, align 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i8> undef, ptr %p, align 1
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr %p, align 2
@@ -285,6 +338,19 @@ define void @store(ptr %p) {
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CODESIZE-LABEL: 'store'
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i1 undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 1 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 2 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 4 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 32 x i1> undef, ptr %p, align 4
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 undef, ptr %p, align 1
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i8> undef, ptr %p, align 1
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr %p, align 2
@@ -352,6 +418,20 @@ define void @store(ptr %p) {
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 32 x ptr> undef, ptr %p, align 256
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+ store i1 undef, ptr %p
+ store <1 x i1> undef, ptr %p
+ store <2 x i1> undef, ptr %p
+ store <4 x i1> undef, ptr %p
+ store <8 x i1> undef, ptr %p
+ store <16 x i1> undef, ptr %p
+ store <32 x i1> undef, ptr %p
+ store <vscale x 1 x i1> undef, ptr %p
+ store <vscale x 2 x i1> undef, ptr %p
+ store <vscale x 4 x i1> undef, ptr %p
+ store <vscale x 8 x i1> undef, ptr %p
+ store <vscale x 16 x i1> undef, ptr %p
+ store <vscale x 32 x i1> undef, ptr %p
+
store i8 undef, ptr %p
store <1 x i8> undef, ptr %p
store <2 x i8> undef, ptr %p
@@ -497,38 +577,78 @@ define void @store_of_constant(ptr %p) {
define void @load_oddsize_vectors(ptr %p) {
; CHECK-LABEL: 'load_oddsize_vectors'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i32>, ptr %p, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i32>, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %3 = load <3 x i32>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i32>, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %5 = load <5 x i32>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %6 = load <6 x i32>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %7 = load <7 x i32>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = load <8 x i32>, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %9 = load <9 x i32>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %10 = load <15 x i32>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = load <16 x i32>, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %12 = load <31 x i32>, ptr %p, align 128
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %13 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load <5 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <6 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <7 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i1>, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <9 x i1>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <15 x i1>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <16 x i1>, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <31 x i1>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <32 x i1>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <1 x i32>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <2 x i32>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <3 x i32>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i32>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <5 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = load <6 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = load <7 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %21 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %22 = load <9 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %23 = load <15 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %25 = load <31 x i32>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %26 = load <32 x i32>, ptr %p, align 128
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CODESIZE-LABEL: 'load_oddsize_vectors'
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i32>, ptr %p, align 4
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i32>, ptr %p, align 8
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i32>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i32>, ptr %p, align 16
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load <5 x i32>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <6 x i32>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <7 x i32>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i32>, ptr %p, align 32
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <9 x i32>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <15 x i32>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <16 x i32>, ptr %p, align 64
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <31 x i32>, ptr %p, align 128
-; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = load <1 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = load <2 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = load <3 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = load <4 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = load <5 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load <6 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load <7 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <8 x i1>, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <9 x i1>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <15 x i1>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <16 x i1>, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <31 x i1>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <32 x i1>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <1 x i32>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <2 x i32>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <3 x i32>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <5 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <6 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <7 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <9 x i32>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = load <15 x i32>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = load <31 x i32>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = load <32 x i32>, ptr %p, align 128
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+ load <1 x i1>, ptr %p
+ load <2 x i1>, ptr %p
+ load <3 x i1>, ptr %p
+ load <4 x i1>, ptr %p
+ load <5 x i1>, ptr %p
+ load <6 x i1>, ptr %p
+ load <7 x i1>, ptr %p
+ load <8 x i1>, ptr %p
+ load <9 x i1>, ptr %p
+ load <15 x i1>, ptr %p
+ load <16 x i1>, ptr %p
+ load <31 x i1>, ptr %p
+ load <32 x i1>, ptr %p
+
load <1 x i32>, ptr %p
load <2 x i32>, ptr %p
load <3 x i32>, ptr %p
@@ -548,21 +668,45 @@ define void @load_oddsize_vectors(ptr %p) {
define void @store_oddsize_vectors(ptr %p) {
; CHECK-LABEL: 'store_oddsize_vectors'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <5 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <6 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <7 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <15 x i1> undef, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <31 x i1> undef, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <3 x i32> undef, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> undef, ptr %p, align 16
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i32> undef, ptr %p, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: store <5 x i32> undef, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: store <6 x i32> undef, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <7 x i32> undef, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <5 x i32> undef, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <6 x i32> undef, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <7 x i32> undef, ptr %p, align 32
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> undef, ptr %p, align 32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: store <15 x i32> undef, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <15 x i32> undef, ptr %p, align 64
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <16 x i32> undef, ptr %p, align 64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 256 for instruction: store <31 x i32> undef, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <31 x i32> undef, ptr %p, align 128
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> undef, ptr %p, align 128
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CODESIZE-LABEL: 'store_oddsize_vectors'
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <5 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <6 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <7 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i1> undef, ptr %p, align 1
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <15 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i1> undef, ptr %p, align 2
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <31 x i1> undef, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i1> undef, ptr %p, align 4
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <1 x i32> undef, ptr %p, align 4
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr %p, align 8
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> undef, ptr %p, align 16
@@ -577,6 +721,19 @@ define void @store_oddsize_vectors(ptr %p) {
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <32 x i32> undef, ptr %p, align 128
; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+ store <1 x i1> undef, ptr %p
+ store <2 x i1> undef, ptr %p
+ store <3 x i1> undef, ptr %p
+ store <4 x i1> undef, ptr %p
+ store <5 x i1> undef, ptr %p
+ store <6 x i1> undef, ptr %p
+ store <7 x i1> undef, ptr %p
+ store <8 x i1> undef, ptr %p
+ store <15 x i1> undef, ptr %p
+ store <16 x i1> undef, ptr %p
+ store <31 x i1> undef, ptr %p
+ store <32 x i1> undef, ptr %p
+
store <1 x i32> undef, ptr %p
store <2 x i32> undef, ptr %p
store <3 x i32> undef, ptr %p
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index 963bb8a..d99a6a7 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -1076,11 +1076,11 @@ define i32 @smul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'smul'
@@ -1114,11 +1114,11 @@ define i32 @smul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'smul'
@@ -1314,11 +1314,11 @@ define i32 @umul(i32 %arg) {
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'umul'
@@ -1352,11 +1352,11 @@ define i32 @umul(i32 %arg) {
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SLM-LABEL: 'umul'
diff --git a/llvm/test/Analysis/CycleInfo/unreachable-predecessor.ll b/llvm/test/Analysis/CycleInfo/unreachable-predecessor.ll
new file mode 100644
index 0000000..36a2115
--- /dev/null
+++ b/llvm/test/Analysis/CycleInfo/unreachable-predecessor.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -disable-output -passes='print<cycles>' 2>&1 | FileCheck %s
+; CHECK-LABEL: CycleInfo for function: unreachable
+; CHECK: depth=1: entries(loop.body) loop.latch inner.block
+define void @unreachable(i32 %n) {
+entry:
+ br label %loop.body
+
+loop.body:
+ br label %inner.block
+
+; This branch should not cause %inner.block to appear as an entry.
+unreachable.block:
+ br label %inner.block
+
+inner.block:
+ br i1 undef, label %loop.exit, label %loop.latch
+
+loop.latch:
+ br label %loop.body
+
+loop.exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
index b3e3b20..471954f 100644
--- a/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/finite-trip-count.ll
@@ -56,6 +56,12 @@ define void @sle_pre_inc_infinite(i32 %len) {
; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count.
; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count.
; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (0 smax (1 + (sext i32 %len to i64))<nsw>)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nssw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (0 smax (1 + (sext i32 %len to i64))<nsw>)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nssw>
;
entry:
br label %for.body
@@ -121,6 +127,12 @@ define void @ule_pre_inc_infinite(i32 %len) {
; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count.
; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count.
; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (1 + (zext i32 %len to i64))<nuw><nsw>
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (1 + (zext i32 %len to i64))<nuw><nsw>
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nusw>
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
index 82b4d0e..49288c8 100644
--- a/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ne-overflow.ll
@@ -240,6 +240,9 @@ define void @test_zext(i64 %N) mustprogress {
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (%N /u 2)
; CHECK-NEXT: Predicates:
; CHECK-NEXT: {0,+,2}<nuw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (%N /u 2)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,2}<nuw><%for.body> Added Flags: <nusw>
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
index e9c13f5..64306ac 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-implied-addrec.ll
@@ -61,6 +61,9 @@ define void @nw_implies_nsw(i16 %n) mustprogress {
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (128 + (-128 smax %n))
; CHECK-NEXT: Predicates:
; CHECK-NEXT: {-128,+,1}<%for.body> Added Flags: <nssw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (128 + (-128 smax %n))
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {-128,+,1}<%for.body> Added Flags: <nssw>
;
entry:
br label %for.body
@@ -107,6 +110,9 @@ define void @actually_infinite() {
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is i16 257
; CHECK-NEXT: Predicates:
; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is i16 257
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%for.body> Added Flags: <nusw>
;
entry:
br label %for.body
@@ -132,6 +138,9 @@ define void @rhs_mustexit_1(i16 %n.raw) mustprogress {
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
; CHECK-NEXT: Predicates:
; CHECK-NEXT: {1,+,1}<nw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {1,+,1}<nw><%for.body> Added Flags: <nusw>
;
entry:
%n.and = and i16 %n.raw, 255
@@ -233,6 +242,9 @@ define void @neg_rhs_wrong_range(i16 %n.raw) mustprogress {
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((-1 + (2 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>)) /u 2)
; CHECK-NEXT: Predicates:
; CHECK-NEXT: {2,+,2}<nw><%for.body> Added Flags: <nusw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((-1 + (2 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>)) /u 2)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {2,+,2}<nw><%for.body> Added Flags: <nusw>
;
entry:
%n.and = and i16 %n.raw, 255
@@ -260,6 +272,9 @@ define void @neg_rhs_maybe_infinite(i16 %n.raw) {
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
; CHECK-NEXT: Predicates:
; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is (-1 + (1 umax (-1 + (zext i8 (trunc i16 %n.raw to i8) to i16))<nsw>))
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {1,+,1}<%for.body> Added Flags: <nusw>
;
entry:
%n.and = and i16 %n.raw, 255
@@ -382,6 +397,9 @@ define void @ult_constant_rhs_stride2_neg(i16 %n.raw, i8 %start) {
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))<nsw>)<nsw> /u 2)
; CHECK-NEXT: Predicates:
; CHECK-NEXT: {(2 + %start),+,2}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT: Loop %for.body: Predicated symbolic max backedge-taken count is ((256 + (-1 * (zext i8 (2 + %start) to i16))<nsw>)<nsw> /u 2)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {(2 + %start),+,2}<%for.body> Added Flags: <nusw>
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll
index befcabd..7c94983 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-scalable-stride.ll
@@ -362,3 +362,166 @@ for.body: ; preds = %entry, %for.body
for.end: ; preds = %for.body, %entry
ret void
}
+
+; The next two cases check to see if we can infer the flags on the IV
+; of a countup loop using vscale strides. vscale is a power of two
+; and these are finite loops by assumption.
+
+define void @vscale_slt_noflags(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscale_slt_noflags'
+; CHECK-NEXT: Classifying expressions for: @vscale_slt_noflags
+; CHECK-NEXT: %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT: %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+; CHECK-NEXT: --> {0,+,vscale}<%for.body> U: full-set S: full-set Exits: (vscale * ((-1 + %n) /u vscale))<nuw> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+; CHECK-NEXT: --> {%A,+,(4 * vscale)<nuw><nsw>}<%for.body> U: full-set S: full-set Exits: ((4 * vscale * ((-1 + %n) /u vscale)) + %A) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %add = add i32 %i.05, %vscale
+; CHECK-NEXT: --> {vscale,+,vscale}<nw><%for.body> U: full-set S: full-set Exits: (vscale * (1 + ((-1 + %n) /u vscale))<nuw>) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @vscale_slt_noflags
+; CHECK-NEXT: Loop %for.body: backedge-taken count is ((-1 + %n) /u vscale)
+; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 1073741822
+; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is ((-1 + %n) /u vscale)
+; CHECK-NEXT: Loop %for.body: Trip multiple is 1
+;
+entry:
+ %vscale = call i32 @llvm.vscale.i32()
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+ %0 = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+ %inc = add nsw <vscale x 4 x i32> %0, splat (i32 1)
+ store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+ %add = add i32 %i.05, %vscale
+ %cmp = icmp slt i32 %add, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+define void @vscalex4_ult_noflags(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscalex4_ult_noflags'
+; CHECK-NEXT: Classifying expressions for: @vscalex4_ult_noflags
+; CHECK-NEXT: %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT: %VF = mul i32 %vscale, 4
+; CHECK-NEXT: --> (4 * vscale)<nuw><nsw> U: [8,4097) S: [8,4097)
+; CHECK-NEXT: %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+; CHECK-NEXT: --> {0,+,(4 * vscale)<nuw><nsw>}<%for.body> U: [0,-3) S: [-2147483648,2147483645) Exits: (4 * vscale * ((-1 + %n) /u (4 * vscale)<nuw><nsw>)) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+; CHECK-NEXT: --> {%A,+,(16 * vscale)<nuw><nsw>}<%for.body> U: full-set S: full-set Exits: ((16 * vscale * ((-1 + %n) /u (4 * vscale)<nuw><nsw>)) + %A) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %add = add i32 %i.05, %VF
+; CHECK-NEXT: --> {(4 * vscale)<nuw><nsw>,+,(4 * vscale)<nuw><nsw>}<nw><%for.body> U: [0,-3) S: [-2147483648,2147483645) Exits: (vscale * (4 + (4 * ((-1 + %n) /u (4 * vscale)<nuw><nsw>))<nuw><nsw>)<nuw>) LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @vscalex4_ult_noflags
+; CHECK-NEXT: Loop %for.body: backedge-taken count is ((-1 + %n) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT: Loop %for.body: constant max backedge-taken count is i32 536870910
+; CHECK-NEXT: Loop %for.body: symbolic max backedge-taken count is ((-1 + %n) /u (4 * vscale)<nuw><nsw>)
+; CHECK-NEXT: Loop %for.body: Trip multiple is 1
+;
+entry:
+ %vscale = call i32 @llvm.vscale.i32()
+ %VF = mul i32 %vscale, 4
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05
+ %0 = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+ %inc = add nsw <vscale x 4 x i32> %0, splat (i32 1)
+ store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+ %add = add i32 %i.05, %VF
+ %cmp = icmp ult i32 %add, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; The next two cases check to see if we can infer the flags on the IV
+; of a countdown loop using vscale strides.
+; TODO: We should be able to because vscale is a power of two and these
+; are finite loops by assumption.
+
+define void @vscale_countdown_ne(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscale_countdown_ne'
+; CHECK-NEXT: Classifying expressions for: @vscale_countdown_ne
+; CHECK-NEXT: %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT: %start = sub i32 %n, %vscale
+; CHECK-NEXT: --> ((-1 * vscale)<nsw> + %n) U: full-set S: full-set
+; CHECK-NEXT: %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+; CHECK-NEXT: --> {((-1 * vscale)<nsw> + %n),+,(-1 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+; CHECK-NEXT: --> {((4 * %n) + (-4 * vscale)<nsw> + %A),+,(-4 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %sub = sub i32 %iv, %vscale
+; CHECK-NEXT: --> {((-2 * vscale)<nsw> + %n),+,(-1 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @vscale_countdown_ne
+; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+ %vscale = call i32 @llvm.vscale.i32()
+ %cmp4 = icmp sgt i32 %n, 0
+ %start = sub i32 %n, %vscale
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+ %ld = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+ %inc = add nsw <vscale x 4 x i32> %ld, splat (i32 1)
+ store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+ %sub = sub i32 %iv, %vscale
+ %cmp = icmp ne i32 %sub, 0
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+define void @vscalex4_countdown_ne(ptr nocapture %A, i32 %n) mustprogress vscale_range(2,1024) {
+; CHECK-LABEL: 'vscalex4_countdown_ne'
+; CHECK-NEXT: Classifying expressions for: @vscalex4_countdown_ne
+; CHECK-NEXT: %vscale = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: --> vscale U: [2,1025) S: [2,1025)
+; CHECK-NEXT: %VF = shl i32 %vscale, 2
+; CHECK-NEXT: --> (4 * vscale)<nuw><nsw> U: [8,4097) S: [8,4097)
+; CHECK-NEXT: %start = sub i32 %n, %VF
+; CHECK-NEXT: --> ((-4 * vscale)<nsw> + %n) U: full-set S: full-set
+; CHECK-NEXT: %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+; CHECK-NEXT: --> {((-4 * vscale)<nsw> + %n),+,(-4 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+; CHECK-NEXT: --> {((4 * %n) + (-16 * vscale)<nsw> + %A),+,(-16 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: %sub = sub i32 %iv, %VF
+; CHECK-NEXT: --> {((-8 * vscale)<nsw> + %n),+,(-4 * vscale)<nsw>}<%for.body> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %for.body: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @vscalex4_countdown_ne
+; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+ %vscale = call i32 @llvm.vscale.i32()
+ %VF = shl i32 %vscale, 2
+ %cmp4 = icmp sgt i32 %n, 0
+ %start = sub i32 %n, %VF
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %sub, %for.body ], [ %start, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %A, i32 %iv
+ %ld = load <vscale x 4 x i32>, ptr %arrayidx, align 4
+ %inc = add nsw <vscale x 4 x i32> %ld, splat (i32 1)
+ store <vscale x 4 x i32> %inc, ptr %arrayidx, align 4
+ %sub = sub i32 %iv, %VF
+ %cmp = icmp ne i32 %sub, 0
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll
index ab9acbc..45e3d03 100644
--- a/llvm/test/Bindings/llvm-c/echo.ll
+++ b/llvm/test/Bindings/llvm-c/echo.ll
@@ -70,7 +70,7 @@ define void @types() {
%9 = alloca [3 x i22], align 4
%10 = alloca ptr addrspace(5), align 8
%11 = alloca <5 x ptr>, align 64
- %12 = alloca x86_mmx, align 8
+ %12 = alloca <1 x i64>, align 8
ret void
}
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index a756703..e5592b3 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1112,8 +1112,6 @@ define void @typesystem() {
; CHECK: %t5 = alloca x86_fp80
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
- %t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca ptr
; CHECK: %t8 = alloca ptr
%t9 = alloca <4 x i32>
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
index e5387d4..64fb5b3 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
@@ -238,7 +238,7 @@ define dso_local void @caller() nounwind {
; CHECK-NEXT: .symidx exp
; CHECK-NEXT: .word 0
; CHECK-NEXT: .section .drectve,"yni"
-; CHECK-NEXT: .ascii " /EXPORT:\"#exp$hp_target,EXPORTAS,exp$hp_target\""
+; CHECK-NEXT: .ascii " /EXPORT:exp"
; CHECK-NEXT: .def func;
; CHECK-NEXT: .scl 2;
diff --git a/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll b/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll
index f53fd44..6e603b7 100644
--- a/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll
+++ b/llvm/test/CodeGen/AArch64/exp10-libcall-names.ll
@@ -7,6 +7,12 @@
; RUN: llc -mtriple=aarch64-apple-tvos6.0 < %s | FileCheck -check-prefix=APPLE %s
; RUN: llc -mtriple=aarch64-apple-xros6.0 < %s | FileCheck -check-prefix=APPLE %s
; RUN: llc -mtriple=aarch64-apple-xros1.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-driverkit < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-driverkit1.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-driverkit24.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=arm64-apple-bridgeos < %s | FileCheck -check-prefix=BRIDGEOS %s
+; RUN: llc -mtriple=arm64-apple-bridgeos1.0 < %s | FileCheck -check-prefix=BRIDGEOS %s
+; RUN: llc -mtriple=arm64-apple-bridgeos9.0 < %s | FileCheck -check-prefix=BRIDGEOS %s
; RUN: not llc -mtriple=aarch64-apple-macos10.8 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
; RUN: not llc -mtriple=aarch64-apple-ios6.0 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
@@ -23,6 +29,11 @@ define float @test_exp10_f32(float %x) {
; APPLE-LABEL: test_exp10_f32:
; APPLE: ; %bb.0:
; APPLE-NEXT: b ___exp10f
+;
+; BRIDGEOS-LABEL: test_exp10_f32:
+; BRIDGEOS: // %bb.0:
+; BRIDGEOS-NEXT: b __exp10f
+;
%ret = call float @llvm.exp10.f32(float %x)
ret float %ret
}
@@ -35,6 +46,11 @@ define double @test_exp10_f64(double %x) {
; APPLE-LABEL: test_exp10_f64:
; APPLE: ; %bb.0:
; APPLE-NEXT: b ___exp10
+;
+; BRIDGEOS-LABEL: test_exp10_f64:
+; BRIDGEOS: // %bb.0:
+; BRIDGEOS-NEXT: b __exp10
+;
%ret = call double @llvm.exp10.f64(double %x)
ret double %ret
}
diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
index 62669a6..251d7a4 100644
--- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
@@ -466,72 +466,19 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) {
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: movi v1.8h, #68, lsl #8
; CHECK-NO16-NEXT: fcvtl v2.4s, v0.4h
-; CHECK-NO16-NEXT: mov w8, #32767 // =0x7fff
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-NO16-NEXT: mov w11, #-32768 // =0xffff8000
; CHECK-NO16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-NO16-NEXT: fmul v2.4s, v2.4s, v3.4s
; CHECK-NO16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-NO16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-NO16-NEXT: fcvtn2 v1.8h, v0.4s
-; CHECK-NO16-NEXT: fcvtl2 v0.4s, v1.8h
-; CHECK-NO16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-NO16-NEXT: mov s2, v0.s[1]
-; CHECK-NO16-NEXT: fcvtzs w10, s0
-; CHECK-NO16-NEXT: fcvtzs w15, s1
-; CHECK-NO16-NEXT: fcvtzs w9, s2
-; CHECK-NO16-NEXT: mov s2, v0.s[2]
-; CHECK-NO16-NEXT: mov s0, v0.s[3]
-; CHECK-NO16-NEXT: cmp w9, w8
-; CHECK-NO16-NEXT: fcvtzs w12, s2
-; CHECK-NO16-NEXT: mov s2, v1.s[1]
-; CHECK-NO16-NEXT: csel w9, w9, w8, lt
-; CHECK-NO16-NEXT: fcvtzs w13, s0
-; CHECK-NO16-NEXT: mov s0, v1.s[2]
-; CHECK-NO16-NEXT: cmn w9, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: csel w9, w9, w11, gt
-; CHECK-NO16-NEXT: cmp w10, w8
-; CHECK-NO16-NEXT: csel w10, w10, w8, lt
-; CHECK-NO16-NEXT: fcvtzs w14, s2
-; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: fcvtzs w16, s0
-; CHECK-NO16-NEXT: mov s0, v1.s[3]
-; CHECK-NO16-NEXT: csel w10, w10, w11, gt
-; CHECK-NO16-NEXT: cmp w12, w8
-; CHECK-NO16-NEXT: csel w12, w12, w8, lt
-; CHECK-NO16-NEXT: fmov s1, w10
-; CHECK-NO16-NEXT: cmn w12, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: csel w12, w12, w11, gt
-; CHECK-NO16-NEXT: cmp w13, w8
-; CHECK-NO16-NEXT: csel w13, w13, w8, lt
-; CHECK-NO16-NEXT: mov v1.s[1], w9
-; CHECK-NO16-NEXT: fcvtzs w9, s0
-; CHECK-NO16-NEXT: cmn w13, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: csel w13, w13, w11, gt
-; CHECK-NO16-NEXT: cmp w14, w8
-; CHECK-NO16-NEXT: csel w14, w14, w8, lt
-; CHECK-NO16-NEXT: cmn w14, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: mov v1.s[2], w12
-; CHECK-NO16-NEXT: csel w14, w14, w11, gt
-; CHECK-NO16-NEXT: cmp w15, w8
-; CHECK-NO16-NEXT: csel w15, w15, w8, lt
-; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: csel w10, w15, w11, gt
-; CHECK-NO16-NEXT: cmp w16, w8
-; CHECK-NO16-NEXT: mov v1.s[3], w13
-; CHECK-NO16-NEXT: fmov s2, w10
-; CHECK-NO16-NEXT: csel w10, w16, w8, lt
-; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: csel w10, w10, w11, gt
-; CHECK-NO16-NEXT: cmp w9, w8
-; CHECK-NO16-NEXT: mov v2.s[1], w14
-; CHECK-NO16-NEXT: csel w8, w9, w8, lt
-; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT: csel w8, w8, w11, gt
-; CHECK-NO16-NEXT: mov v2.s[2], w10
-; CHECK-NO16-NEXT: mov v2.s[3], w8
-; CHECK-NO16-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-NO16-NEXT: fcvtl v0.4s, v1.4h
+; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-NO16-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NO16-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-NO16-NEXT: sqxtn v0.4h, v0.4s
+; CHECK-NO16-NEXT: sqxtn2 v0.8h, v1.4s
; CHECK-NO16-NEXT: ret
;
; CHECK-FP16-LABEL: test_v8f16_sat:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 91c8b7f..4626fd7 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -2014,47 +2014,17 @@ declare <8 x i128> @llvm.fptosi.sat.v8f16.v8i128(<8 x half>)
define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
; CHECK-CVT-LABEL: test_signed_v8f16_v8i1:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: fcvtzs w9, s1
-; CHECK-CVT-NEXT: fcvtzs w13, s0
-; CHECK-CVT-NEXT: fcvtzs w8, s2
-; CHECK-CVT-NEXT: mov s2, v1.s[2]
-; CHECK-CVT-NEXT: mov s1, v1.s[3]
-; CHECK-CVT-NEXT: ands w8, w8, w8, asr #31
-; CHECK-CVT-NEXT: fcvtzs w10, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: fcvtzs w11, s1
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT: ands w9, w9, w9, asr #31
-; CHECK-CVT-NEXT: csinv w9, w9, wzr, ge
-; CHECK-CVT-NEXT: ands w10, w10, w10, asr #31
-; CHECK-CVT-NEXT: fcvtzs w12, s2
-; CHECK-CVT-NEXT: fcvtzs w14, s1
-; CHECK-CVT-NEXT: fmov s1, w9
-; CHECK-CVT-NEXT: fcvtzs w9, s0
-; CHECK-CVT-NEXT: csinv w10, w10, wzr, ge
-; CHECK-CVT-NEXT: ands w11, w11, w11, asr #31
-; CHECK-CVT-NEXT: csinv w11, w11, wzr, ge
-; CHECK-CVT-NEXT: ands w12, w12, w12, asr #31
-; CHECK-CVT-NEXT: mov v1.s[1], w8
-; CHECK-CVT-NEXT: csinv w12, w12, wzr, ge
-; CHECK-CVT-NEXT: ands w13, w13, w13, asr #31
-; CHECK-CVT-NEXT: csinv w13, w13, wzr, ge
-; CHECK-CVT-NEXT: ands w8, w14, w14, asr #31
-; CHECK-CVT-NEXT: mov v1.s[2], w10
-; CHECK-CVT-NEXT: fmov s2, w13
-; CHECK-CVT-NEXT: csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT: mov v2.s[1], w12
-; CHECK-CVT-NEXT: mov v1.s[3], w11
-; CHECK-CVT-NEXT: mov v2.s[2], w8
-; CHECK-CVT-NEXT: ands w8, w9, w9, asr #31
-; CHECK-CVT-NEXT: csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-CVT-NEXT: movi v3.2d, #0xffffffffffffffff
+; CHECK-CVT-NEXT: fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT: smin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: smax v1.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: smax v0.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
; CHECK-CVT-NEXT: ret
;
@@ -2074,65 +2044,17 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
define <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
; CHECK-CVT-LABEL: test_signed_v8f16_v8i8:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: mov w8, #127 // =0x7f
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w11, #-128 // =0xffffff80
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: fcvtzs w10, s1
-; CHECK-CVT-NEXT: fcvtzs w15, s0
-; CHECK-CVT-NEXT: fcvtzs w9, s2
-; CHECK-CVT-NEXT: mov s2, v1.s[2]
-; CHECK-CVT-NEXT: mov s1, v1.s[3]
-; CHECK-CVT-NEXT: cmp w9, #127
-; CHECK-CVT-NEXT: fcvtzs w12, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: csel w9, w9, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w13, s1
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: cmn w9, #128
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: csel w9, w9, w11, gt
-; CHECK-CVT-NEXT: cmp w10, #127
-; CHECK-CVT-NEXT: csel w10, w10, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w14, s2
-; CHECK-CVT-NEXT: cmn w10, #128
-; CHECK-CVT-NEXT: fcvtzs w16, s1
-; CHECK-CVT-NEXT: csel w10, w10, w11, gt
-; CHECK-CVT-NEXT: cmp w12, #127
-; CHECK-CVT-NEXT: csel w12, w12, w8, lt
-; CHECK-CVT-NEXT: fmov s1, w10
-; CHECK-CVT-NEXT: cmn w12, #128
-; CHECK-CVT-NEXT: csel w12, w12, w11, gt
-; CHECK-CVT-NEXT: cmp w13, #127
-; CHECK-CVT-NEXT: csel w13, w13, w8, lt
-; CHECK-CVT-NEXT: mov v1.s[1], w9
-; CHECK-CVT-NEXT: fcvtzs w9, s0
-; CHECK-CVT-NEXT: cmn w13, #128
-; CHECK-CVT-NEXT: csel w13, w13, w11, gt
-; CHECK-CVT-NEXT: cmp w14, #127
-; CHECK-CVT-NEXT: csel w14, w14, w8, lt
-; CHECK-CVT-NEXT: cmn w14, #128
-; CHECK-CVT-NEXT: mov v1.s[2], w12
-; CHECK-CVT-NEXT: csel w14, w14, w11, gt
-; CHECK-CVT-NEXT: cmp w15, #127
-; CHECK-CVT-NEXT: csel w15, w15, w8, lt
-; CHECK-CVT-NEXT: cmn w15, #128
-; CHECK-CVT-NEXT: csel w10, w15, w11, gt
-; CHECK-CVT-NEXT: cmp w16, #127
-; CHECK-CVT-NEXT: mov v1.s[3], w13
-; CHECK-CVT-NEXT: fmov s2, w10
-; CHECK-CVT-NEXT: csel w10, w16, w8, lt
-; CHECK-CVT-NEXT: cmn w10, #128
-; CHECK-CVT-NEXT: csel w10, w10, w11, gt
-; CHECK-CVT-NEXT: cmp w9, #127
-; CHECK-CVT-NEXT: mov v2.s[1], w14
-; CHECK-CVT-NEXT: csel w8, w9, w8, lt
-; CHECK-CVT-NEXT: cmn w8, #128
-; CHECK-CVT-NEXT: csel w8, w8, w11, gt
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: movi v1.4s, #127
+; CHECK-CVT-NEXT: fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT: smin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: mvni v1.4s, #127
+; CHECK-CVT-NEXT: smax v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
; CHECK-CVT-NEXT: ret
;
@@ -2148,65 +2070,17 @@ define <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
; CHECK-CVT-LABEL: test_signed_v8f16_v8i13:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: mov w8, #4095 // =0xfff
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w11, #-4096 // =0xfffff000
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: fcvtzs w10, s1
-; CHECK-CVT-NEXT: fcvtzs w15, s0
-; CHECK-CVT-NEXT: fcvtzs w9, s2
-; CHECK-CVT-NEXT: mov s2, v1.s[2]
-; CHECK-CVT-NEXT: mov s1, v1.s[3]
-; CHECK-CVT-NEXT: cmp w9, #4095
-; CHECK-CVT-NEXT: fcvtzs w12, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: csel w9, w9, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w13, s1
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: cmn w9, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: csel w9, w9, w11, gt
-; CHECK-CVT-NEXT: cmp w10, #4095
-; CHECK-CVT-NEXT: csel w10, w10, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w14, s2
-; CHECK-CVT-NEXT: cmn w10, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: fcvtzs w16, s1
-; CHECK-CVT-NEXT: csel w10, w10, w11, gt
-; CHECK-CVT-NEXT: cmp w12, #4095
-; CHECK-CVT-NEXT: csel w12, w12, w8, lt
-; CHECK-CVT-NEXT: fmov s1, w10
-; CHECK-CVT-NEXT: cmn w12, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: csel w12, w12, w11, gt
-; CHECK-CVT-NEXT: cmp w13, #4095
-; CHECK-CVT-NEXT: csel w13, w13, w8, lt
-; CHECK-CVT-NEXT: mov v1.s[1], w9
-; CHECK-CVT-NEXT: fcvtzs w9, s0
-; CHECK-CVT-NEXT: cmn w13, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: csel w13, w13, w11, gt
-; CHECK-CVT-NEXT: cmp w14, #4095
-; CHECK-CVT-NEXT: csel w14, w14, w8, lt
-; CHECK-CVT-NEXT: cmn w14, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: mov v1.s[2], w12
-; CHECK-CVT-NEXT: csel w14, w14, w11, gt
-; CHECK-CVT-NEXT: cmp w15, #4095
-; CHECK-CVT-NEXT: csel w15, w15, w8, lt
-; CHECK-CVT-NEXT: cmn w15, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: csel w10, w15, w11, gt
-; CHECK-CVT-NEXT: cmp w16, #4095
-; CHECK-CVT-NEXT: mov v1.s[3], w13
-; CHECK-CVT-NEXT: fmov s2, w10
-; CHECK-CVT-NEXT: csel w10, w16, w8, lt
-; CHECK-CVT-NEXT: cmn w10, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: csel w10, w10, w11, gt
-; CHECK-CVT-NEXT: cmp w9, #4095
-; CHECK-CVT-NEXT: mov v2.s[1], w14
-; CHECK-CVT-NEXT: csel w8, w9, w8, lt
-; CHECK-CVT-NEXT: cmn w8, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT: csel w8, w8, w11, gt
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: movi v1.4s, #15, msl #8
+; CHECK-CVT-NEXT: fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT: smin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: mvni v1.4s, #15, msl #8
+; CHECK-CVT-NEXT: smax v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v8f16_v8i13:
@@ -2224,65 +2098,12 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
define <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) {
; CHECK-CVT-LABEL: test_signed_v8f16_v8i16:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w11, #-32768 // =0xffff8000
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: fcvtzs w10, s1
-; CHECK-CVT-NEXT: fcvtzs w15, s0
-; CHECK-CVT-NEXT: fcvtzs w9, s2
-; CHECK-CVT-NEXT: mov s2, v1.s[2]
-; CHECK-CVT-NEXT: mov s1, v1.s[3]
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: fcvtzs w12, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: csel w9, w9, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w13, s1
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: cmn w9, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: csel w9, w9, w11, gt
-; CHECK-CVT-NEXT: cmp w10, w8
-; CHECK-CVT-NEXT: csel w10, w10, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w14, s2
-; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: fcvtzs w16, s1
-; CHECK-CVT-NEXT: csel w10, w10, w11, gt
-; CHECK-CVT-NEXT: cmp w12, w8
-; CHECK-CVT-NEXT: csel w12, w12, w8, lt
-; CHECK-CVT-NEXT: fmov s1, w10
-; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w12, w12, w11, gt
-; CHECK-CVT-NEXT: cmp w13, w8
-; CHECK-CVT-NEXT: csel w13, w13, w8, lt
-; CHECK-CVT-NEXT: mov v1.s[1], w9
-; CHECK-CVT-NEXT: fcvtzs w9, s0
-; CHECK-CVT-NEXT: cmn w13, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w13, w13, w11, gt
-; CHECK-CVT-NEXT: cmp w14, w8
-; CHECK-CVT-NEXT: csel w14, w14, w8, lt
-; CHECK-CVT-NEXT: cmn w14, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: mov v1.s[2], w12
-; CHECK-CVT-NEXT: csel w14, w14, w11, gt
-; CHECK-CVT-NEXT: cmp w15, w8
-; CHECK-CVT-NEXT: csel w15, w15, w8, lt
-; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w10, w15, w11, gt
-; CHECK-CVT-NEXT: cmp w16, w8
-; CHECK-CVT-NEXT: mov v1.s[3], w13
-; CHECK-CVT-NEXT: fmov s2, w10
-; CHECK-CVT-NEXT: csel w10, w16, w8, lt
-; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w10, w10, w11, gt
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: mov v2.s[1], w14
-; CHECK-CVT-NEXT: csel w8, w9, w8, lt
-; CHECK-CVT-NEXT: cmn w8, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w8, w8, w11, gt
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-CVT-NEXT: sqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT: fcvtzs v1.4s, v2.4s
+; CHECK-CVT-NEXT: sqxtn2 v0.8h, v1.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v8f16_v8i16:
@@ -2984,123 +2805,27 @@ define <16 x i16> @test_signed_v16f32_v16i16(<16 x float> %f) {
define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) {
; CHECK-CVT-LABEL: test_signed_v16f16_v16i8:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: mov w8, #127 // =0x7f
+; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h
; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: mov s3, v2.s[1]
-; CHECK-CVT-NEXT: fcvtzs w10, s2
-; CHECK-CVT-NEXT: fcvtzs w16, s1
-; CHECK-CVT-NEXT: fcvtzs w9, s3
-; CHECK-CVT-NEXT: mov s3, v2.s[2]
-; CHECK-CVT-NEXT: mov s2, v2.s[3]
-; CHECK-CVT-NEXT: cmp w9, #127
-; CHECK-CVT-NEXT: fcvtzs w12, s3
-; CHECK-CVT-NEXT: mov s3, v1.s[1]
-; CHECK-CVT-NEXT: csel w11, w9, w8, lt
-; CHECK-CVT-NEXT: mov w9, #-128 // =0xffffff80
-; CHECK-CVT-NEXT: fcvtzs w14, s2
-; CHECK-CVT-NEXT: cmn w11, #128
-; CHECK-CVT-NEXT: mov s2, v1.s[2]
-; CHECK-CVT-NEXT: mov s1, v1.s[3]
-; CHECK-CVT-NEXT: csel w11, w11, w9, gt
-; CHECK-CVT-NEXT: cmp w10, #127
-; CHECK-CVT-NEXT: csel w10, w10, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w15, s3
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: cmn w10, #128
+; CHECK-CVT-NEXT: fcvtl2 v4.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: csel w13, w10, w9, gt
-; CHECK-CVT-NEXT: cmp w12, #127
-; CHECK-CVT-NEXT: fcvtzs w17, s1
-; CHECK-CVT-NEXT: csel w10, w12, w8, lt
-; CHECK-CVT-NEXT: cmn w10, #128
-; CHECK-CVT-NEXT: mov s1, v3.s[2]
-; CHECK-CVT-NEXT: fcvtzs w0, s3
-; CHECK-CVT-NEXT: csel w10, w10, w9, gt
-; CHECK-CVT-NEXT: cmp w14, #127
-; CHECK-CVT-NEXT: fcvtzs w4, s0
-; CHECK-CVT-NEXT: csel w12, w14, w8, lt
-; CHECK-CVT-NEXT: cmn w12, #128
-; CHECK-CVT-NEXT: csel w12, w12, w9, gt
-; CHECK-CVT-NEXT: cmp w15, #127
-; CHECK-CVT-NEXT: fcvtzs w1, s1
-; CHECK-CVT-NEXT: csel w14, w15, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w15, s2
-; CHECK-CVT-NEXT: mov s2, v3.s[1]
-; CHECK-CVT-NEXT: cmn w14, #128
-; CHECK-CVT-NEXT: mov s1, v0.s[1]
-; CHECK-CVT-NEXT: csel w14, w14, w9, gt
-; CHECK-CVT-NEXT: cmp w16, #127
-; CHECK-CVT-NEXT: csel w16, w16, w8, lt
-; CHECK-CVT-NEXT: cmn w16, #128
-; CHECK-CVT-NEXT: fcvtzs w18, s2
-; CHECK-CVT-NEXT: mov s2, v3.s[3]
-; CHECK-CVT-NEXT: csel w16, w16, w9, gt
-; CHECK-CVT-NEXT: cmp w15, #127
-; CHECK-CVT-NEXT: fcvtzs w3, s1
-; CHECK-CVT-NEXT: csel w15, w15, w8, lt
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: cmn w15, #128
-; CHECK-CVT-NEXT: csel w15, w15, w9, gt
-; CHECK-CVT-NEXT: cmp w17, #127
-; CHECK-CVT-NEXT: fcvtzs w2, s2
-; CHECK-CVT-NEXT: csel w17, w17, w8, lt
-; CHECK-CVT-NEXT: fmov s2, w13
-; CHECK-CVT-NEXT: cmn w17, #128
-; CHECK-CVT-NEXT: csel w17, w17, w9, gt
-; CHECK-CVT-NEXT: cmp w18, #127
-; CHECK-CVT-NEXT: csel w18, w18, w8, lt
-; CHECK-CVT-NEXT: mov v2.s[1], w11
-; CHECK-CVT-NEXT: cmn w18, #128
-; CHECK-CVT-NEXT: csel w18, w18, w9, gt
-; CHECK-CVT-NEXT: cmp w0, #127
-; CHECK-CVT-NEXT: csel w0, w0, w8, lt
-; CHECK-CVT-NEXT: cmn w0, #128
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: csel w0, w0, w9, gt
-; CHECK-CVT-NEXT: cmp w1, #127
-; CHECK-CVT-NEXT: csel w1, w1, w8, lt
-; CHECK-CVT-NEXT: fmov s3, w0
-; CHECK-CVT-NEXT: cmn w1, #128
-; CHECK-CVT-NEXT: csel w1, w1, w9, gt
-; CHECK-CVT-NEXT: cmp w2, #127
-; CHECK-CVT-NEXT: mov v2.s[3], w12
-; CHECK-CVT-NEXT: csel w2, w2, w8, lt
-; CHECK-CVT-NEXT: mov v3.s[1], w18
-; CHECK-CVT-NEXT: cmn w2, #128
-; CHECK-CVT-NEXT: csel w2, w2, w9, gt
-; CHECK-CVT-NEXT: cmp w3, #127
-; CHECK-CVT-NEXT: csel w3, w3, w8, lt
-; CHECK-CVT-NEXT: cmn w3, #128
-; CHECK-CVT-NEXT: mov v3.s[2], w1
-; CHECK-CVT-NEXT: csel w13, w3, w9, gt
-; CHECK-CVT-NEXT: cmp w4, #127
-; CHECK-CVT-NEXT: csel w3, w4, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w4, s1
-; CHECK-CVT-NEXT: fmov s1, w16
-; CHECK-CVT-NEXT: cmn w3, #128
-; CHECK-CVT-NEXT: csel w11, w3, w9, gt
-; CHECK-CVT-NEXT: mov v3.s[3], w2
-; CHECK-CVT-NEXT: fmov s4, w11
-; CHECK-CVT-NEXT: mov v1.s[1], w14
-; CHECK-CVT-NEXT: fcvtzs w11, s0
-; CHECK-CVT-NEXT: cmp w4, #127
-; CHECK-CVT-NEXT: mov v4.s[1], w13
-; CHECK-CVT-NEXT: csel w13, w4, w8, lt
-; CHECK-CVT-NEXT: cmn w13, #128
-; CHECK-CVT-NEXT: mov v1.s[2], w15
-; CHECK-CVT-NEXT: csel w10, w13, w9, gt
-; CHECK-CVT-NEXT: cmp w11, #127
-; CHECK-CVT-NEXT: csel w8, w11, w8, lt
-; CHECK-CVT-NEXT: mov v4.s[2], w10
-; CHECK-CVT-NEXT: cmn w8, #128
-; CHECK-CVT-NEXT: csel w8, w8, w9, gt
-; CHECK-CVT-NEXT: mov v1.s[3], w17
-; CHECK-CVT-NEXT: mov v4.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT: uzp1 v1.8h, v4.8h, v3.8h
-; CHECK-CVT-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127
+; CHECK-CVT-NEXT: fcvtzs v3.4s, v3.4s
+; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-CVT-NEXT: fcvtzs v4.4s, v4.4s
+; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-NEXT: smin v3.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: smin v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: smin v4.4s, v4.4s, v2.4s
+; CHECK-CVT-NEXT: smin v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: mvni v2.4s, #127
+; CHECK-CVT-NEXT: smax v3.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: smax v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: smax v4.4s, v4.4s, v2.4s
+; CHECK-CVT-NEXT: smax v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v4.8h
+; CHECK-CVT-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v16f16_v16i8:
@@ -3117,122 +2842,18 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) {
define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) {
; CHECK-CVT-LABEL: test_signed_v16f16_v16i16:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov s3, v2.s[1]
-; CHECK-CVT-NEXT: fcvtzs w10, s2
-; CHECK-CVT-NEXT: fcvtzs w16, s0
-; CHECK-CVT-NEXT: fcvtzs w9, s3
-; CHECK-CVT-NEXT: mov s3, v2.s[2]
-; CHECK-CVT-NEXT: mov s2, v2.s[3]
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: fcvtzs w12, s3
-; CHECK-CVT-NEXT: mov s3, v0.s[1]
-; CHECK-CVT-NEXT: csel w11, w9, w8, lt
-; CHECK-CVT-NEXT: mov w9, #-32768 // =0xffff8000
-; CHECK-CVT-NEXT: fcvtzs w14, s2
-; CHECK-CVT-NEXT: cmn w11, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: mov s2, v0.s[2]
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: csel w11, w11, w9, gt
-; CHECK-CVT-NEXT: cmp w10, w8
-; CHECK-CVT-NEXT: csel w10, w10, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w15, s3
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h
-; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: csel w13, w10, w9, gt
-; CHECK-CVT-NEXT: cmp w12, w8
-; CHECK-CVT-NEXT: fcvtzs w17, s0
-; CHECK-CVT-NEXT: csel w10, w12, w8, lt
-; CHECK-CVT-NEXT: cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: mov s0, v3.s[2]
-; CHECK-CVT-NEXT: fcvtzs w0, s3
-; CHECK-CVT-NEXT: csel w10, w10, w9, gt
-; CHECK-CVT-NEXT: cmp w14, w8
-; CHECK-CVT-NEXT: fcvtzs w4, s1
-; CHECK-CVT-NEXT: csel w12, w14, w8, lt
-; CHECK-CVT-NEXT: cmn w12, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w12, w12, w9, gt
-; CHECK-CVT-NEXT: cmp w15, w8
-; CHECK-CVT-NEXT: fcvtzs w1, s0
-; CHECK-CVT-NEXT: csel w14, w15, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w15, s2
-; CHECK-CVT-NEXT: mov s2, v3.s[1]
-; CHECK-CVT-NEXT: cmn w14, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: mov s0, v1.s[1]
-; CHECK-CVT-NEXT: csel w14, w14, w9, gt
-; CHECK-CVT-NEXT: cmp w16, w8
-; CHECK-CVT-NEXT: csel w16, w16, w8, lt
-; CHECK-CVT-NEXT: cmn w16, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: fcvtzs w18, s2
-; CHECK-CVT-NEXT: mov s2, v3.s[3]
-; CHECK-CVT-NEXT: csel w16, w16, w9, gt
-; CHECK-CVT-NEXT: cmp w15, w8
-; CHECK-CVT-NEXT: fcvtzs w3, s0
-; CHECK-CVT-NEXT: csel w15, w15, w8, lt
-; CHECK-CVT-NEXT: mov s0, v1.s[2]
-; CHECK-CVT-NEXT: cmn w15, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w15, w15, w9, gt
-; CHECK-CVT-NEXT: cmp w17, w8
-; CHECK-CVT-NEXT: fcvtzs w2, s2
-; CHECK-CVT-NEXT: csel w17, w17, w8, lt
-; CHECK-CVT-NEXT: fmov s2, w13
-; CHECK-CVT-NEXT: cmn w17, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w17, w17, w9, gt
-; CHECK-CVT-NEXT: cmp w18, w8
-; CHECK-CVT-NEXT: csel w18, w18, w8, lt
-; CHECK-CVT-NEXT: mov v2.s[1], w11
-; CHECK-CVT-NEXT: cmn w18, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w18, w18, w9, gt
-; CHECK-CVT-NEXT: cmp w0, w8
-; CHECK-CVT-NEXT: csel w0, w0, w8, lt
-; CHECK-CVT-NEXT: cmn w0, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: csel w0, w0, w9, gt
-; CHECK-CVT-NEXT: cmp w1, w8
-; CHECK-CVT-NEXT: csel w1, w1, w8, lt
-; CHECK-CVT-NEXT: fmov s3, w0
-; CHECK-CVT-NEXT: cmn w1, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w1, w1, w9, gt
-; CHECK-CVT-NEXT: cmp w2, w8
-; CHECK-CVT-NEXT: mov v2.s[3], w12
-; CHECK-CVT-NEXT: csel w2, w2, w8, lt
-; CHECK-CVT-NEXT: mov v3.s[1], w18
-; CHECK-CVT-NEXT: cmn w2, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w2, w2, w9, gt
-; CHECK-CVT-NEXT: cmp w3, w8
-; CHECK-CVT-NEXT: csel w3, w3, w8, lt
-; CHECK-CVT-NEXT: cmn w3, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: mov v3.s[2], w1
-; CHECK-CVT-NEXT: csel w13, w3, w9, gt
-; CHECK-CVT-NEXT: cmp w4, w8
-; CHECK-CVT-NEXT: csel w3, w4, w8, lt
-; CHECK-CVT-NEXT: fcvtzs w4, s0
-; CHECK-CVT-NEXT: mov s0, v1.s[3]
-; CHECK-CVT-NEXT: cmn w3, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: fmov s1, w16
-; CHECK-CVT-NEXT: csel w11, w3, w9, gt
-; CHECK-CVT-NEXT: mov v3.s[3], w2
-; CHECK-CVT-NEXT: fmov s4, w11
-; CHECK-CVT-NEXT: mov v1.s[1], w14
-; CHECK-CVT-NEXT: cmp w4, w8
-; CHECK-CVT-NEXT: fcvtzs w11, s0
-; CHECK-CVT-NEXT: mov v4.s[1], w13
-; CHECK-CVT-NEXT: csel w13, w4, w8, lt
-; CHECK-CVT-NEXT: cmn w13, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w10, w13, w9, gt
-; CHECK-CVT-NEXT: mov v1.s[2], w15
-; CHECK-CVT-NEXT: cmp w11, w8
-; CHECK-CVT-NEXT: csel w8, w11, w8, lt
-; CHECK-CVT-NEXT: mov v4.s[2], w10
-; CHECK-CVT-NEXT: cmn w8, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT: csel w8, w8, w9, gt
-; CHECK-CVT-NEXT: mov v1.s[3], w17
-; CHECK-CVT-NEXT: mov v4.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT: uzp1 v1.8h, v4.8h, v3.8h
+; CHECK-CVT-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-NEXT: fcvtl2 v4.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtl2 v5.4s, v1.8h
+; CHECK-CVT-NEXT: fcvtzs v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzs v1.4s, v3.4s
+; CHECK-CVT-NEXT: fcvtzs v3.4s, v5.4s
+; CHECK-CVT-NEXT: sqxtn v0.4h, v2.4s
+; CHECK-CVT-NEXT: fcvtzs v2.4s, v4.4s
+; CHECK-CVT-NEXT: sqxtn v1.4h, v1.4s
+; CHECK-CVT-NEXT: sqxtn2 v0.8h, v2.4s
+; CHECK-CVT-NEXT: sqxtn2 v1.8h, v3.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v16f16_v16i16:
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 6089d76..a3b94bc 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -1708,47 +1708,14 @@ declare <8 x i128> @llvm.fptoui.sat.v8f16.v8i128(<8 x half>)
define <8 x i1> @test_unsigned_v8f16_v8i1(<8 x half> %f) {
; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i1:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: mov s3, v1.s[2]
-; CHECK-CVT-NEXT: mov s4, v1.s[3]
-; CHECK-CVT-NEXT: fcvtzu w9, s1
-; CHECK-CVT-NEXT: fcvtzu w13, s0
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: fcvtzu w8, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: fcvtzu w10, s3
-; CHECK-CVT-NEXT: fcvtzu w11, s4
-; CHECK-CVT-NEXT: fcvtzu w14, s1
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: cmp w8, #1
-; CHECK-CVT-NEXT: fcvtzu w12, s2
-; CHECK-CVT-NEXT: csinc w8, w8, wzr, lo
-; CHECK-CVT-NEXT: cmp w9, #1
-; CHECK-CVT-NEXT: csinc w9, w9, wzr, lo
-; CHECK-CVT-NEXT: cmp w10, #1
-; CHECK-CVT-NEXT: csinc w10, w10, wzr, lo
-; CHECK-CVT-NEXT: cmp w11, #1
-; CHECK-CVT-NEXT: fmov s1, w9
-; CHECK-CVT-NEXT: csinc w11, w11, wzr, lo
-; CHECK-CVT-NEXT: cmp w12, #1
-; CHECK-CVT-NEXT: csinc w12, w12, wzr, lo
-; CHECK-CVT-NEXT: cmp w13, #1
-; CHECK-CVT-NEXT: csinc w13, w13, wzr, lo
-; CHECK-CVT-NEXT: mov v1.s[1], w8
-; CHECK-CVT-NEXT: cmp w14, #1
-; CHECK-CVT-NEXT: fmov s2, w13
-; CHECK-CVT-NEXT: fcvtzu w8, s0
-; CHECK-CVT-NEXT: csinc w9, w14, wzr, lo
-; CHECK-CVT-NEXT: mov v2.s[1], w12
-; CHECK-CVT-NEXT: mov v1.s[2], w10
-; CHECK-CVT-NEXT: cmp w8, #1
-; CHECK-CVT-NEXT: csinc w8, w8, wzr, lo
-; CHECK-CVT-NEXT: mov v2.s[2], w9
-; CHECK-CVT-NEXT: mov v1.s[3], w11
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT: umin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
; CHECK-CVT-NEXT: ret
;
@@ -1766,48 +1733,14 @@ define <8 x i1> @test_unsigned_v8f16_v8i1(<8 x half> %f) {
define <8 x i8> @test_unsigned_v8f16_v8i8(<8 x half> %f) {
; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i8:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w8, #255 // =0xff
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: mov s3, v1.s[2]
-; CHECK-CVT-NEXT: mov s4, v1.s[3]
-; CHECK-CVT-NEXT: fcvtzu w10, s1
-; CHECK-CVT-NEXT: fcvtzu w14, s0
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: fcvtzu w9, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: fcvtzu w11, s3
-; CHECK-CVT-NEXT: fcvtzu w12, s4
-; CHECK-CVT-NEXT: fcvtzu w15, s1
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: cmp w9, #255
-; CHECK-CVT-NEXT: fcvtzu w13, s2
-; CHECK-CVT-NEXT: csel w9, w9, w8, lo
-; CHECK-CVT-NEXT: cmp w10, #255
-; CHECK-CVT-NEXT: csel w10, w10, w8, lo
-; CHECK-CVT-NEXT: cmp w11, #255
-; CHECK-CVT-NEXT: csel w11, w11, w8, lo
-; CHECK-CVT-NEXT: cmp w12, #255
-; CHECK-CVT-NEXT: fmov s1, w10
-; CHECK-CVT-NEXT: csel w12, w12, w8, lo
-; CHECK-CVT-NEXT: cmp w13, #255
-; CHECK-CVT-NEXT: csel w13, w13, w8, lo
-; CHECK-CVT-NEXT: cmp w14, #255
-; CHECK-CVT-NEXT: csel w14, w14, w8, lo
-; CHECK-CVT-NEXT: mov v1.s[1], w9
-; CHECK-CVT-NEXT: cmp w15, #255
-; CHECK-CVT-NEXT: fmov s2, w14
-; CHECK-CVT-NEXT: fcvtzu w9, s0
-; CHECK-CVT-NEXT: csel w10, w15, w8, lo
-; CHECK-CVT-NEXT: mov v2.s[1], w13
-; CHECK-CVT-NEXT: mov v1.s[2], w11
-; CHECK-CVT-NEXT: cmp w9, #255
-; CHECK-CVT-NEXT: csel w8, w9, w8, lo
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: mov v1.s[3], w12
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: movi v1.2d, #0x0000ff000000ff
+; CHECK-CVT-NEXT: fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT: umin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
; CHECK-CVT-NEXT: ret
;
@@ -1823,48 +1756,14 @@ define <8 x i8> @test_unsigned_v8f16_v8i8(<8 x half> %f) {
define <8 x i13> @test_unsigned_v8f16_v8i13(<8 x half> %f) {
; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i13:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w8, #8191 // =0x1fff
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: mov s3, v1.s[2]
-; CHECK-CVT-NEXT: mov s4, v1.s[3]
-; CHECK-CVT-NEXT: fcvtzu w10, s1
-; CHECK-CVT-NEXT: fcvtzu w14, s0
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: fcvtzu w9, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: fcvtzu w11, s3
-; CHECK-CVT-NEXT: fcvtzu w12, s4
-; CHECK-CVT-NEXT: fcvtzu w15, s1
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: fcvtzu w13, s2
-; CHECK-CVT-NEXT: csel w9, w9, w8, lo
-; CHECK-CVT-NEXT: cmp w10, w8
-; CHECK-CVT-NEXT: csel w10, w10, w8, lo
-; CHECK-CVT-NEXT: cmp w11, w8
-; CHECK-CVT-NEXT: csel w11, w11, w8, lo
-; CHECK-CVT-NEXT: cmp w12, w8
-; CHECK-CVT-NEXT: fmov s1, w10
-; CHECK-CVT-NEXT: csel w12, w12, w8, lo
-; CHECK-CVT-NEXT: cmp w13, w8
-; CHECK-CVT-NEXT: csel w13, w13, w8, lo
-; CHECK-CVT-NEXT: cmp w14, w8
-; CHECK-CVT-NEXT: csel w14, w14, w8, lo
-; CHECK-CVT-NEXT: mov v1.s[1], w9
-; CHECK-CVT-NEXT: cmp w15, w8
-; CHECK-CVT-NEXT: fmov s2, w14
-; CHECK-CVT-NEXT: fcvtzu w9, s0
-; CHECK-CVT-NEXT: csel w10, w15, w8, lo
-; CHECK-CVT-NEXT: mov v2.s[1], w13
-; CHECK-CVT-NEXT: mov v1.s[2], w11
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: csel w8, w9, w8, lo
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: mov v1.s[3], w12
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: movi v1.4s, #31, msl #8
+; CHECK-CVT-NEXT: fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT: umin v2.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i13:
@@ -1880,48 +1779,12 @@ define <8 x i13> @test_unsigned_v8f16_v8i13(<8 x half> %f) {
define <8 x i16> @test_unsigned_v8f16_v8i16(<8 x half> %f) {
; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i16:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w8, #65535 // =0xffff
-; CHECK-CVT-NEXT: mov s2, v1.s[1]
-; CHECK-CVT-NEXT: mov s3, v1.s[2]
-; CHECK-CVT-NEXT: mov s4, v1.s[3]
-; CHECK-CVT-NEXT: fcvtzu w10, s1
-; CHECK-CVT-NEXT: fcvtzu w14, s0
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: fcvtzu w9, s2
-; CHECK-CVT-NEXT: mov s2, v0.s[1]
-; CHECK-CVT-NEXT: fcvtzu w11, s3
-; CHECK-CVT-NEXT: fcvtzu w12, s4
-; CHECK-CVT-NEXT: fcvtzu w15, s1
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: fcvtzu w13, s2
-; CHECK-CVT-NEXT: csel w9, w9, w8, lo
-; CHECK-CVT-NEXT: cmp w10, w8
-; CHECK-CVT-NEXT: csel w10, w10, w8, lo
-; CHECK-CVT-NEXT: cmp w11, w8
-; CHECK-CVT-NEXT: csel w11, w11, w8, lo
-; CHECK-CVT-NEXT: cmp w12, w8
-; CHECK-CVT-NEXT: fmov s1, w10
-; CHECK-CVT-NEXT: csel w12, w12, w8, lo
-; CHECK-CVT-NEXT: cmp w13, w8
-; CHECK-CVT-NEXT: csel w13, w13, w8, lo
-; CHECK-CVT-NEXT: cmp w14, w8
-; CHECK-CVT-NEXT: csel w14, w14, w8, lo
-; CHECK-CVT-NEXT: mov v1.s[1], w9
-; CHECK-CVT-NEXT: cmp w15, w8
-; CHECK-CVT-NEXT: fmov s2, w14
-; CHECK-CVT-NEXT: fcvtzu w9, s0
-; CHECK-CVT-NEXT: csel w10, w15, w8, lo
-; CHECK-CVT-NEXT: mov v2.s[1], w13
-; CHECK-CVT-NEXT: mov v1.s[2], w11
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: csel w8, w9, w8, lo
-; CHECK-CVT-NEXT: mov v2.s[2], w10
-; CHECK-CVT-NEXT: mov v1.s[3], w12
-; CHECK-CVT-NEXT: mov v2.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v2.8h, v1.8h
+; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT: uqxtn v0.4h, v1.4s
+; CHECK-CVT-NEXT: fcvtzu v1.4s, v2.4s
+; CHECK-CVT-NEXT: uqxtn2 v0.8h, v1.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i16:
@@ -2509,90 +2372,22 @@ define <16 x i16> @test_unsigned_v16f32_v16i16(<16 x float> %f) {
define <16 x i8> @test_unsigned_v16f16_v16i8(<16 x half> %f) {
; CHECK-CVT-LABEL: test_unsigned_v16f16_v16i8:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h
; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: mov w8, #255 // =0xff
-; CHECK-CVT-NEXT: mov s3, v2.s[1]
-; CHECK-CVT-NEXT: mov s4, v2.s[2]
-; CHECK-CVT-NEXT: mov s5, v2.s[3]
-; CHECK-CVT-NEXT: fcvtzu w10, s2
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtzu w13, s1
+; CHECK-CVT-NEXT: fcvtl2 v4.4s, v0.8h
; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtzu w9, s3
-; CHECK-CVT-NEXT: mov s3, v1.s[1]
-; CHECK-CVT-NEXT: fcvtzu w11, s4
-; CHECK-CVT-NEXT: mov s4, v1.s[2]
-; CHECK-CVT-NEXT: fcvtzu w12, s5
-; CHECK-CVT-NEXT: mov s1, v1.s[3]
-; CHECK-CVT-NEXT: fcvtzu w18, s2
-; CHECK-CVT-NEXT: fcvtzu w3, s0
-; CHECK-CVT-NEXT: fcvtzu w14, s3
-; CHECK-CVT-NEXT: cmp w9, #255
-; CHECK-CVT-NEXT: mov s3, v2.s[1]
-; CHECK-CVT-NEXT: csel w9, w9, w8, lo
-; CHECK-CVT-NEXT: cmp w10, #255
-; CHECK-CVT-NEXT: fcvtzu w15, s4
-; CHECK-CVT-NEXT: csel w10, w10, w8, lo
-; CHECK-CVT-NEXT: cmp w11, #255
-; CHECK-CVT-NEXT: mov s4, v2.s[2]
-; CHECK-CVT-NEXT: csel w11, w11, w8, lo
-; CHECK-CVT-NEXT: cmp w12, #255
-; CHECK-CVT-NEXT: fcvtzu w16, s1
-; CHECK-CVT-NEXT: mov s1, v2.s[3]
-; CHECK-CVT-NEXT: csel w12, w12, w8, lo
-; CHECK-CVT-NEXT: cmp w14, #255
-; CHECK-CVT-NEXT: fcvtzu w17, s3
-; CHECK-CVT-NEXT: mov s3, v0.s[1]
-; CHECK-CVT-NEXT: csel w14, w14, w8, lo
-; CHECK-CVT-NEXT: cmp w13, #255
-; CHECK-CVT-NEXT: fcvtzu w0, s4
-; CHECK-CVT-NEXT: fmov s2, w10
-; CHECK-CVT-NEXT: csel w13, w13, w8, lo
-; CHECK-CVT-NEXT: cmp w15, #255
-; CHECK-CVT-NEXT: csel w15, w15, w8, lo
-; CHECK-CVT-NEXT: cmp w16, #255
-; CHECK-CVT-NEXT: fcvtzu w1, s1
-; CHECK-CVT-NEXT: csel w16, w16, w8, lo
-; CHECK-CVT-NEXT: cmp w17, #255
-; CHECK-CVT-NEXT: fcvtzu w2, s3
-; CHECK-CVT-NEXT: csel w17, w17, w8, lo
-; CHECK-CVT-NEXT: cmp w18, #255
-; CHECK-CVT-NEXT: mov s1, v0.s[2]
-; CHECK-CVT-NEXT: csel w18, w18, w8, lo
-; CHECK-CVT-NEXT: cmp w0, #255
-; CHECK-CVT-NEXT: mov v2.s[1], w9
-; CHECK-CVT-NEXT: csel w0, w0, w8, lo
-; CHECK-CVT-NEXT: cmp w1, #255
-; CHECK-CVT-NEXT: fmov s3, w18
-; CHECK-CVT-NEXT: csel w10, w1, w8, lo
-; CHECK-CVT-NEXT: cmp w2, #255
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: csel w9, w2, w8, lo
-; CHECK-CVT-NEXT: cmp w3, #255
-; CHECK-CVT-NEXT: fcvtzu w2, s1
-; CHECK-CVT-NEXT: csel w1, w3, w8, lo
-; CHECK-CVT-NEXT: fmov s1, w13
-; CHECK-CVT-NEXT: mov v3.s[1], w17
-; CHECK-CVT-NEXT: fmov s4, w1
-; CHECK-CVT-NEXT: mov v2.s[2], w11
-; CHECK-CVT-NEXT: mov v1.s[1], w14
-; CHECK-CVT-NEXT: cmp w2, #255
-; CHECK-CVT-NEXT: mov v4.s[1], w9
-; CHECK-CVT-NEXT: fcvtzu w9, s0
-; CHECK-CVT-NEXT: csel w11, w2, w8, lo
-; CHECK-CVT-NEXT: mov v3.s[2], w0
-; CHECK-CVT-NEXT: mov v2.s[3], w12
-; CHECK-CVT-NEXT: mov v1.s[2], w15
-; CHECK-CVT-NEXT: mov v4.s[2], w11
-; CHECK-CVT-NEXT: cmp w9, #255
-; CHECK-CVT-NEXT: csel w8, w9, w8, lo
-; CHECK-CVT-NEXT: mov v3.s[3], w10
-; CHECK-CVT-NEXT: mov v1.s[3], w16
-; CHECK-CVT-NEXT: mov v4.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT: uzp1 v1.8h, v4.8h, v3.8h
-; CHECK-CVT-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-CVT-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-CVT-NEXT: fcvtzu v3.4s, v3.4s
+; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-CVT-NEXT: fcvtzu v4.4s, v4.4s
+; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-NEXT: umin v3.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: umin v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: umin v4.4s, v4.4s, v2.4s
+; CHECK-CVT-NEXT: umin v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v4.8h
+; CHECK-CVT-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i8:
@@ -2609,89 +2404,18 @@ define <16 x i8> @test_unsigned_v16f16_v16i8(<16 x half> %f) {
define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) {
; CHECK-CVT-LABEL: test_unsigned_v16f16_v16i16:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w8, #65535 // =0xffff
-; CHECK-CVT-NEXT: mov s3, v2.s[1]
-; CHECK-CVT-NEXT: mov s4, v2.s[2]
-; CHECK-CVT-NEXT: mov s5, v2.s[3]
-; CHECK-CVT-NEXT: fcvtzu w10, s2
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtzu w13, s0
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtzu w9, s3
-; CHECK-CVT-NEXT: mov s3, v0.s[1]
-; CHECK-CVT-NEXT: fcvtzu w11, s4
-; CHECK-CVT-NEXT: mov s4, v0.s[2]
-; CHECK-CVT-NEXT: fcvtzu w12, s5
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
-; CHECK-CVT-NEXT: fcvtzu w18, s2
-; CHECK-CVT-NEXT: fcvtzu w3, s1
-; CHECK-CVT-NEXT: fcvtzu w14, s3
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: mov s3, v2.s[1]
-; CHECK-CVT-NEXT: csel w9, w9, w8, lo
-; CHECK-CVT-NEXT: cmp w10, w8
-; CHECK-CVT-NEXT: fcvtzu w15, s4
-; CHECK-CVT-NEXT: csel w10, w10, w8, lo
-; CHECK-CVT-NEXT: cmp w11, w8
-; CHECK-CVT-NEXT: mov s4, v2.s[2]
-; CHECK-CVT-NEXT: csel w11, w11, w8, lo
-; CHECK-CVT-NEXT: cmp w12, w8
-; CHECK-CVT-NEXT: fcvtzu w16, s0
-; CHECK-CVT-NEXT: mov s0, v2.s[3]
-; CHECK-CVT-NEXT: csel w12, w12, w8, lo
-; CHECK-CVT-NEXT: cmp w14, w8
-; CHECK-CVT-NEXT: fcvtzu w17, s3
-; CHECK-CVT-NEXT: mov s3, v1.s[1]
-; CHECK-CVT-NEXT: csel w14, w14, w8, lo
-; CHECK-CVT-NEXT: cmp w13, w8
-; CHECK-CVT-NEXT: fcvtzu w0, s4
-; CHECK-CVT-NEXT: fmov s2, w10
-; CHECK-CVT-NEXT: csel w13, w13, w8, lo
-; CHECK-CVT-NEXT: cmp w15, w8
-; CHECK-CVT-NEXT: csel w15, w15, w8, lo
-; CHECK-CVT-NEXT: cmp w16, w8
-; CHECK-CVT-NEXT: fcvtzu w1, s0
-; CHECK-CVT-NEXT: csel w16, w16, w8, lo
-; CHECK-CVT-NEXT: cmp w17, w8
-; CHECK-CVT-NEXT: fcvtzu w2, s3
-; CHECK-CVT-NEXT: csel w17, w17, w8, lo
-; CHECK-CVT-NEXT: cmp w18, w8
-; CHECK-CVT-NEXT: mov s0, v1.s[2]
-; CHECK-CVT-NEXT: csel w18, w18, w8, lo
-; CHECK-CVT-NEXT: cmp w0, w8
-; CHECK-CVT-NEXT: mov v2.s[1], w9
-; CHECK-CVT-NEXT: csel w0, w0, w8, lo
-; CHECK-CVT-NEXT: cmp w1, w8
-; CHECK-CVT-NEXT: fmov s3, w18
-; CHECK-CVT-NEXT: csel w10, w1, w8, lo
-; CHECK-CVT-NEXT: cmp w2, w8
-; CHECK-CVT-NEXT: csel w9, w2, w8, lo
-; CHECK-CVT-NEXT: cmp w3, w8
-; CHECK-CVT-NEXT: fcvtzu w2, s0
-; CHECK-CVT-NEXT: csel w1, w3, w8, lo
-; CHECK-CVT-NEXT: mov s0, v1.s[3]
-; CHECK-CVT-NEXT: fmov s1, w13
-; CHECK-CVT-NEXT: fmov s4, w1
-; CHECK-CVT-NEXT: mov v3.s[1], w17
-; CHECK-CVT-NEXT: mov v2.s[2], w11
-; CHECK-CVT-NEXT: mov v1.s[1], w14
-; CHECK-CVT-NEXT: cmp w2, w8
-; CHECK-CVT-NEXT: mov v4.s[1], w9
-; CHECK-CVT-NEXT: fcvtzu w9, s0
-; CHECK-CVT-NEXT: csel w11, w2, w8, lo
-; CHECK-CVT-NEXT: mov v3.s[2], w0
-; CHECK-CVT-NEXT: mov v2.s[3], w12
-; CHECK-CVT-NEXT: mov v1.s[2], w15
-; CHECK-CVT-NEXT: mov v4.s[2], w11
-; CHECK-CVT-NEXT: cmp w9, w8
-; CHECK-CVT-NEXT: csel w8, w9, w8, lo
-; CHECK-CVT-NEXT: mov v3.s[3], w10
-; CHECK-CVT-NEXT: mov v1.s[3], w16
-; CHECK-CVT-NEXT: mov v4.s[3], w8
-; CHECK-CVT-NEXT: uzp1 v0.8h, v1.8h, v2.8h
-; CHECK-CVT-NEXT: uzp1 v1.8h, v4.8h, v3.8h
+; CHECK-CVT-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-NEXT: fcvtl2 v4.4s, v0.8h
+; CHECK-CVT-NEXT: fcvtl2 v5.4s, v1.8h
+; CHECK-CVT-NEXT: fcvtzu v2.4s, v2.4s
+; CHECK-CVT-NEXT: fcvtzu v1.4s, v3.4s
+; CHECK-CVT-NEXT: fcvtzu v3.4s, v5.4s
+; CHECK-CVT-NEXT: uqxtn v0.4h, v2.4s
+; CHECK-CVT-NEXT: fcvtzu v2.4s, v4.4s
+; CHECK-CVT-NEXT: uqxtn v1.4h, v1.4s
+; CHECK-CVT-NEXT: uqxtn2 v0.8h, v2.4s
+; CHECK-CVT-NEXT: uqxtn2 v1.8h, v3.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i16:
diff --git a/llvm/lib/Target/AArch64/peephole-sxtw.mir b/llvm/test/CodeGen/AArch64/peephole-sxtw.mir
index 274d434..274d434 100644
--- a/llvm/lib/Target/AArch64/peephole-sxtw.mir
+++ b/llvm/test/CodeGen/AArch64/peephole-sxtw.mir
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll b/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
index 6afe1a9..d5340dc 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-fpac.ll
@@ -1,12 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple arm64e-apple-darwin -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,NOFPAC
-; RUN: llc < %s -mtriple arm64e-apple-darwin -mattr=+fpac -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,FPAC
+; RUN: llc < %s -mtriple arm64e-apple-darwin -verify-machineinstrs | FileCheck %s -DL="L" --check-prefixes=ALL,NOFPAC
+; RUN: llc < %s -mtriple arm64e-apple-darwin -mattr=+fpac -verify-machineinstrs | FileCheck %s -DL="L" --check-prefixes=ALL,FPAC
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -verify-machineinstrs | FileCheck %s -DL=".L" --check-prefixes=ALL,NOFPAC
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -mattr=+fpac -verify-machineinstrs | FileCheck %s -DL=".L" --check-prefixes=ALL,FPAC
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
; ALL-LABEL: test_auth_ia:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autia x16, x1
; ALL-NEXT: mov x0, x16
@@ -17,7 +19,7 @@ define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
define i64 @test_auth_ia_zero(i64 %arg) {
; ALL-LABEL: test_auth_ia_zero:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autiza x16
; ALL-NEXT: mov x0, x16
@@ -28,7 +30,7 @@ define i64 @test_auth_ia_zero(i64 %arg) {
define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
; ALL-LABEL: test_auth_ib:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autib x16, x1
; ALL-NEXT: mov x0, x16
@@ -39,7 +41,7 @@ define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
define i64 @test_auth_ib_zero(i64 %arg) {
; ALL-LABEL: test_auth_ib_zero:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autizb x16
; ALL-NEXT: mov x0, x16
@@ -50,7 +52,7 @@ define i64 @test_auth_ib_zero(i64 %arg) {
define i64 @test_auth_da(i64 %arg, i64 %arg1) {
; ALL-LABEL: test_auth_da:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autda x16, x1
; ALL-NEXT: mov x0, x16
@@ -61,7 +63,7 @@ define i64 @test_auth_da(i64 %arg, i64 %arg1) {
define i64 @test_auth_da_zero(i64 %arg) {
; ALL-LABEL: test_auth_da_zero:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autdza x16
; ALL-NEXT: mov x0, x16
@@ -72,7 +74,7 @@ define i64 @test_auth_da_zero(i64 %arg) {
define i64 @test_auth_db(i64 %arg, i64 %arg1) {
; ALL-LABEL: test_auth_db:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autdb x16, x1
; ALL-NEXT: mov x0, x16
@@ -83,7 +85,7 @@ define i64 @test_auth_db(i64 %arg, i64 %arg1) {
define i64 @test_auth_db_zero(i64 %arg) {
; ALL-LABEL: test_auth_db_zero:
-; ALL: ; %bb.0:
+; ALL: %bb.0:
; ALL-NEXT: mov x16, x0
; ALL-NEXT: autdzb x16
; ALL-NEXT: mov x0, x16
@@ -96,15 +98,15 @@ define i64 @test_auth_db_zero(i64 %arg) {
; the validity of a signature.
define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_ia_ia:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autia x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpaci x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_0
+; NOFPAC-NEXT: b.eq [[L]]auth_success_0
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_0
+; NOFPAC-NEXT: b [[L]]resign_end_0
; NOFPAC-NEXT: Lauth_success_0:
; NOFPAC-NEXT: pacia x16, x2
; NOFPAC-NEXT: Lresign_end_0:
@@ -112,7 +114,7 @@ define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_ia_ia:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autia x16, x1
; FPAC-NEXT: pacia x16, x2
@@ -124,15 +126,15 @@ define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_ib_ia:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autib x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpaci x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_1
+; NOFPAC-NEXT: b.eq [[L]]auth_success_1
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_1
+; NOFPAC-NEXT: b [[L]]resign_end_1
; NOFPAC-NEXT: Lauth_success_1:
; NOFPAC-NEXT: pacia x16, x2
; NOFPAC-NEXT: Lresign_end_1:
@@ -140,7 +142,7 @@ define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_ib_ia:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autib x16, x1
; FPAC-NEXT: pacia x16, x2
@@ -152,15 +154,15 @@ define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_da_ia:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autda x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpacd x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_2
+; NOFPAC-NEXT: b.eq [[L]]auth_success_2
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_2
+; NOFPAC-NEXT: b [[L]]resign_end_2
; NOFPAC-NEXT: Lauth_success_2:
; NOFPAC-NEXT: pacia x16, x2
; NOFPAC-NEXT: Lresign_end_2:
@@ -168,7 +170,7 @@ define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_da_ia:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autda x16, x1
; FPAC-NEXT: pacia x16, x2
@@ -180,15 +182,15 @@ define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_db_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_db_ia:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autdb x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpacd x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_3
+; NOFPAC-NEXT: b.eq [[L]]auth_success_3
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_3
+; NOFPAC-NEXT: b [[L]]resign_end_3
; NOFPAC-NEXT: Lauth_success_3:
; NOFPAC-NEXT: pacia x16, x2
; NOFPAC-NEXT: Lresign_end_3:
@@ -196,7 +198,7 @@ define i64 @test_resign_db_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_db_ia:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autdb x16, x1
; FPAC-NEXT: pacia x16, x2
@@ -208,15 +210,15 @@ define i64 @test_resign_db_ia(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_db_ib(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_db_ib:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autdb x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpacd x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_4
+; NOFPAC-NEXT: b.eq [[L]]auth_success_4
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_4
+; NOFPAC-NEXT: b [[L]]resign_end_4
; NOFPAC-NEXT: Lauth_success_4:
; NOFPAC-NEXT: pacib x16, x2
; NOFPAC-NEXT: Lresign_end_4:
@@ -224,7 +226,7 @@ define i64 @test_resign_db_ib(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_db_ib:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autdb x16, x1
; FPAC-NEXT: pacib x16, x2
@@ -236,15 +238,15 @@ define i64 @test_resign_db_ib(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_db_da:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autdb x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpacd x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_5
+; NOFPAC-NEXT: b.eq [[L]]auth_success_5
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_5
+; NOFPAC-NEXT: b [[L]]resign_end_5
; NOFPAC-NEXT: Lauth_success_5:
; NOFPAC-NEXT: pacda x16, x2
; NOFPAC-NEXT: Lresign_end_5:
@@ -252,7 +254,7 @@ define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_db_da:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autdb x16, x1
; FPAC-NEXT: pacda x16, x2
@@ -264,15 +266,15 @@ define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_db_db(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_db_db:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autdb x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpacd x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_6
+; NOFPAC-NEXT: b.eq [[L]]auth_success_6
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_6
+; NOFPAC-NEXT: b [[L]]resign_end_6
; NOFPAC-NEXT: Lauth_success_6:
; NOFPAC-NEXT: pacdb x16, x2
; NOFPAC-NEXT: Lresign_end_6:
@@ -280,7 +282,7 @@ define i64 @test_resign_db_db(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_db_db:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autdb x16, x1
; FPAC-NEXT: pacdb x16, x2
@@ -292,15 +294,15 @@ define i64 @test_resign_db_db(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_iza_db:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autiza x16
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpaci x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_7
+; NOFPAC-NEXT: b.eq [[L]]auth_success_7
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_7
+; NOFPAC-NEXT: b [[L]]resign_end_7
; NOFPAC-NEXT: Lauth_success_7:
; NOFPAC-NEXT: pacdb x16, x2
; NOFPAC-NEXT: Lresign_end_7:
@@ -308,7 +310,7 @@ define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_iza_db:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autiza x16
; FPAC-NEXT: pacdb x16, x2
@@ -320,15 +322,15 @@ define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-LABEL: test_resign_da_dzb:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autda x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpacd x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_8
+; NOFPAC-NEXT: b.eq [[L]]auth_success_8
; NOFPAC-NEXT: mov x16, x17
-; NOFPAC-NEXT: b Lresign_end_8
+; NOFPAC-NEXT: b [[L]]resign_end_8
; NOFPAC-NEXT: Lauth_success_8:
; NOFPAC-NEXT: pacdzb x16
; NOFPAC-NEXT: Lresign_end_8:
@@ -336,7 +338,7 @@ define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_resign_da_dzb:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autda x16, x1
; FPAC-NEXT: pacdzb x16
@@ -348,20 +350,20 @@ define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_auth_trap_attribute(i64 %arg, i64 %arg1) "ptrauth-auth-traps" {
; NOFPAC-LABEL: test_auth_trap_attribute:
-; NOFPAC: ; %bb.0:
+; NOFPAC: %bb.0:
; NOFPAC-NEXT: mov x16, x0
; NOFPAC-NEXT: autia x16, x1
; NOFPAC-NEXT: mov x17, x16
; NOFPAC-NEXT: xpaci x17
; NOFPAC-NEXT: cmp x16, x17
-; NOFPAC-NEXT: b.eq Lauth_success_9
+; NOFPAC-NEXT: b.eq [[L]]auth_success_9
; NOFPAC-NEXT: brk #0xc470
; NOFPAC-NEXT: Lauth_success_9:
; NOFPAC-NEXT: mov x0, x16
; NOFPAC-NEXT: ret
;
; FPAC-LABEL: test_auth_trap_attribute:
-; FPAC: ; %bb.0:
+; FPAC: %bb.0:
; FPAC-NEXT: mov x16, x0
; FPAC-NEXT: autia x16, x1
; FPAC-NEXT: mov x0, x16
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
index 3b93acd..74d2370 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign-with-blend.ll
@@ -1,24 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefixes=UNCHECKED,UNCHECKED-DARWIN
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefixes=UNCHECKED,UNCHECKED-DARWIN
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=CHECKED
+; RUN: | FileCheck %s -DL="L" --check-prefixes=CHECKED,CHECKED-DARWIN
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=CHECKED
+; RUN: | FileCheck %s -DL="L" --check-prefixes=CHECKED,CHECKED-DARWIN
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefixes=TRAP,TRAP-DARWIN
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefixes=TRAP,TRAP-DARWIN
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefixes=UNCHECKED,UNCHECKED-ELF
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefixes=UNCHECKED,UNCHECKED-ELF
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0 -verify-machineinstrs \
+; RUN: | FileCheck %s -DL=".L" --check-prefixes=CHECKED,CHECKED-ELF
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN: | FileCheck %s -DL=".L" --check-prefixes=CHECKED,CHECKED-ELF
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefixes=TRAP,TRAP-ELF
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefixes=TRAP,TRAP-ELF
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define i64 @test_auth_blend(i64 %arg, i64 %arg1) {
; UNCHECKED-LABEL: test_auth_blend:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: mov x17, x1
; UNCHECKED-NEXT: movk x17, #65535, lsl #48
@@ -27,7 +42,7 @@ define i64 @test_auth_blend(i64 %arg, i64 %arg1) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_blend:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: mov x17, x1
; CHECKED-NEXT: movk x17, #65535, lsl #48
@@ -36,7 +51,7 @@ define i64 @test_auth_blend(i64 %arg, i64 %arg1) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_blend:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: mov x17, x1
; TRAP-NEXT: movk x17, #65535, lsl #48
@@ -44,7 +59,7 @@ define i64 @test_auth_blend(i64 %arg, i64 %arg1) {
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_0
+; TRAP-NEXT: b.eq [[L]]auth_success_0
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_0:
; TRAP-NEXT: mov x0, x16
@@ -56,7 +71,7 @@ define i64 @test_auth_blend(i64 %arg, i64 %arg1) {
define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_blend:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: mov x17, x1
; UNCHECKED-NEXT: movk x17, #12345, lsl #48
@@ -68,7 +83,7 @@ define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_blend:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: mov x17, x1
; CHECKED-NEXT: movk x17, #12345, lsl #48
@@ -76,9 +91,9 @@ define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpacd x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_0
+; CHECKED-NEXT: b.eq [[L]]auth_success_0
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_0
+; CHECKED-NEXT: b [[L]]resign_end_0
; CHECKED-NEXT: Lauth_success_0:
; CHECKED-NEXT: mov x17, x2
; CHECKED-NEXT: movk x17, #56789, lsl #48
@@ -88,7 +103,7 @@ define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_blend:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: mov x17, x1
; TRAP-NEXT: movk x17, #12345, lsl #48
@@ -96,7 +111,7 @@ define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_1
+; TRAP-NEXT: b.eq [[L]]auth_success_1
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_1:
; TRAP-NEXT: mov x17, x2
@@ -112,18 +127,18 @@ define i64 @test_resign_blend(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_blend_and_const(i64 %arg, i64 %arg1) {
; UNCHECKED-LABEL: test_resign_blend_and_const:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: mov x17, x1
; UNCHECKED-NEXT: movk x17, #12345, lsl #48
; UNCHECKED-NEXT: autda x16, x17
-; UNCHECKED-NEXT: mov x17, #56789 ; =0xddd5
+; UNCHECKED-NEXT: mov x17, #56789
; UNCHECKED-NEXT: pacdb x16, x17
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_blend_and_const:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: mov x17, x1
; CHECKED-NEXT: movk x17, #12345, lsl #48
@@ -131,18 +146,18 @@ define i64 @test_resign_blend_and_const(i64 %arg, i64 %arg1) {
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpacd x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_1
+; CHECKED-NEXT: b.eq [[L]]auth_success_1
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_1
+; CHECKED-NEXT: b [[L]]resign_end_1
; CHECKED-NEXT: Lauth_success_1:
-; CHECKED-NEXT: mov x17, #56789 ; =0xddd5
+; CHECKED-NEXT: mov x17, #56789
; CHECKED-NEXT: pacdb x16, x17
; CHECKED-NEXT: Lresign_end_1:
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_blend_and_const:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: mov x17, x1
; TRAP-NEXT: movk x17, #12345, lsl #48
@@ -150,10 +165,10 @@ define i64 @test_resign_blend_and_const(i64 %arg, i64 %arg1) {
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_2
+; TRAP-NEXT: b.eq [[L]]auth_success_2
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_2:
-; TRAP-NEXT: mov x17, #56789 ; =0xddd5
+; TRAP-NEXT: mov x17, #56789
; TRAP-NEXT: pacdb x16, x17
; TRAP-NEXT: mov x0, x16
; TRAP-NEXT: ret
@@ -164,7 +179,7 @@ define i64 @test_resign_blend_and_const(i64 %arg, i64 %arg1) {
define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_blend_and_addr:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: mov x17, x1
; UNCHECKED-NEXT: movk x17, #12345, lsl #48
@@ -174,7 +189,7 @@ define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_blend_and_addr:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: mov x17, x1
; CHECKED-NEXT: movk x17, #12345, lsl #48
@@ -182,9 +197,9 @@ define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpacd x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_2
+; CHECKED-NEXT: b.eq [[L]]auth_success_2
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_2
+; CHECKED-NEXT: b [[L]]resign_end_2
; CHECKED-NEXT: Lauth_success_2:
; CHECKED-NEXT: pacdb x16, x2
; CHECKED-NEXT: Lresign_end_2:
@@ -192,7 +207,7 @@ define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_blend_and_addr:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: mov x17, x1
; TRAP-NEXT: movk x17, #12345, lsl #48
@@ -200,7 +215,7 @@ define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_3
+; TRAP-NEXT: b.eq [[L]]auth_success_3
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_3:
; TRAP-NEXT: pacdb x16, x2
@@ -212,38 +227,44 @@ define i64 @test_resign_blend_and_addr(i64 %arg, i64 %arg1, i64 %arg2) {
}
define i64 @test_auth_too_large_discriminator(i64 %arg, i64 %arg1) {
-; UNCHECKED-LABEL: test_auth_too_large_discriminator:
-; UNCHECKED: ; %bb.0:
-; UNCHECKED-NEXT: mov w8, #65536 ; =0x10000
-; UNCHECKED-NEXT: bfi x1, x8, #48, #16
-; UNCHECKED-NEXT: mov x16, x0
-; UNCHECKED-NEXT: autda x16, x1
-; UNCHECKED-NEXT: mov x0, x16
-; UNCHECKED-NEXT: ret
+; UNCHECKED-LABEL: test_auth_too_large_discriminator:
+; UNCHECKED: %bb.0:
+; UNCHECKED-NEXT: mov w8, #65536
+; UNCHECKED-DARWIN-NEXT: bfi x1, x8, #48, #16
+; UNCHECKED-DARWIN-NEXT: mov x16, x0
+; UNCHECKED-ELF-NEXT: mov x16, x0
+; UNCHECKED-ELF-NEXT: bfi x1, x8, #48, #16
+; UNCHECKED-NEXT: autda x16, x1
+; UNCHECKED-NEXT: mov x0, x16
+; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_too_large_discriminator:
-; CHECKED: ; %bb.0:
-; CHECKED-NEXT: mov w8, #65536 ; =0x10000
-; CHECKED-NEXT: bfi x1, x8, #48, #16
-; CHECKED-NEXT: mov x16, x0
-; CHECKED-NEXT: autda x16, x1
-; CHECKED-NEXT: mov x0, x16
-; CHECKED-NEXT: ret
+; CHECKED: %bb.0:
+; CHECKED-NEXT: mov w8, #65536
+; CHECKED-DARWIN-NEXT: bfi x1, x8, #48, #16
+; CHECKED-DARWIN-NEXT: mov x16, x0
+; CHECKED-ELF-NEXT: mov x16, x0
+; CHECKED-ELF-NEXT: bfi x1, x8, #48, #16
+; CHECKED-NEXT: autda x16, x1
+; CHECKED-NEXT: mov x0, x16
+; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_too_large_discriminator:
-; TRAP: ; %bb.0:
-; TRAP-NEXT: mov w8, #65536 ; =0x10000
-; TRAP-NEXT: bfi x1, x8, #48, #16
-; TRAP-NEXT: mov x16, x0
-; TRAP-NEXT: autda x16, x1
-; TRAP-NEXT: mov x17, x16
-; TRAP-NEXT: xpacd x17
-; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_4
-; TRAP-NEXT: brk #0xc472
-; TRAP-NEXT: Lauth_success_4:
-; TRAP-NEXT: mov x0, x16
-; TRAP-NEXT: ret
+; TRAP: %bb.0:
+; TRAP-NEXT: mov w8, #65536
+; TRAP-DARWIN-NEXT: bfi x1, x8, #48, #16
+; TRAP-DARWIN-NEXT: mov x16, x0
+; TRAP-ELF-NEXT: mov x16, x0
+; TRAP-ELF-NEXT: bfi x1, x8, #48, #16
+; TRAP-NEXT: autda x16, x1
+; TRAP-NEXT: mov x17, x16
+; TRAP-NEXT: xpacd x17
+; TRAP-NEXT: cmp x16, x17
+; TRAP-NEXT: b.eq [[L]]auth_success_4
+; TRAP-NEXT: brk #0xc472
+; TRAP-NEXT: Lauth_success_4:
+; TRAP-NEXT: mov x0, x16
+; TRAP-NEXT: ret
%tmp0 = call i64 @llvm.ptrauth.blend(i64 %arg1, i64 65536)
%tmp1 = call i64 @llvm.ptrauth.auth(i64 %arg, i32 2, i64 %tmp0)
ret i64 %tmp1
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
index 62c9fba..fdd5ae2 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
@@ -1,44 +1,59 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefix=UNCHECKED
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s --check-prefix=UNCHECKED
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL="L" --check-prefix=UNCHECKED
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=CHECKED
+; RUN: | FileCheck %s -DL="L" --check-prefix=CHECKED
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN: | FileCheck %s --check-prefix=CHECKED
+; RUN: | FileCheck %s -DL="L" --check-prefix=CHECKED
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel=0 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefix=TRAP
; RUN: llc < %s -mtriple arm64e-apple-darwin -global-isel -global-isel-abort=1 -verify-machineinstrs \
-; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s --check-prefix=TRAP
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL="L" --check-prefix=TRAP
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefix=UNCHECKED
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=none | FileCheck %s -DL=".L" --check-prefix=UNCHECKED
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0 -verify-machineinstrs \
+; RUN: | FileCheck %s -DL=".L" --check-prefix=CHECKED
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN: | FileCheck %s -DL=".L" --check-prefix=CHECKED
+
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel=0 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefix=TRAP
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+pauth -global-isel -global-isel-abort=1 -verify-machineinstrs \
+; RUN: -aarch64-ptrauth-auth-checks=trap | FileCheck %s -DL=".L" --check-prefix=TRAP
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
; UNCHECKED-LABEL: test_auth_ia:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autia x16, x1
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_ia:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autia x16, x1
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_ia:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autia x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_0
+; TRAP-NEXT: b.eq [[L]]auth_success_0
; TRAP-NEXT: brk #0xc470
; TRAP-NEXT: Lauth_success_0:
; TRAP-NEXT: mov x0, x16
@@ -49,27 +64,27 @@ define i64 @test_auth_ia(i64 %arg, i64 %arg1) {
define i64 @test_auth_ia_zero(i64 %arg) {
; UNCHECKED-LABEL: test_auth_ia_zero:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autiza x16
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_ia_zero:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autiza x16
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_ia_zero:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autiza x16
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_1
+; TRAP-NEXT: b.eq [[L]]auth_success_1
; TRAP-NEXT: brk #0xc470
; TRAP-NEXT: Lauth_success_1:
; TRAP-NEXT: mov x0, x16
@@ -80,27 +95,27 @@ define i64 @test_auth_ia_zero(i64 %arg) {
define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
; UNCHECKED-LABEL: test_auth_ib:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autib x16, x1
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_ib:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autib x16, x1
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_ib:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autib x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_2
+; TRAP-NEXT: b.eq [[L]]auth_success_2
; TRAP-NEXT: brk #0xc471
; TRAP-NEXT: Lauth_success_2:
; TRAP-NEXT: mov x0, x16
@@ -111,27 +126,27 @@ define i64 @test_auth_ib(i64 %arg, i64 %arg1) {
define i64 @test_auth_ib_zero(i64 %arg) {
; UNCHECKED-LABEL: test_auth_ib_zero:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autizb x16
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_ib_zero:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autizb x16
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_ib_zero:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autizb x16
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_3
+; TRAP-NEXT: b.eq [[L]]auth_success_3
; TRAP-NEXT: brk #0xc471
; TRAP-NEXT: Lauth_success_3:
; TRAP-NEXT: mov x0, x16
@@ -142,27 +157,27 @@ define i64 @test_auth_ib_zero(i64 %arg) {
define i64 @test_auth_da(i64 %arg, i64 %arg1) {
; UNCHECKED-LABEL: test_auth_da:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autda x16, x1
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_da:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autda x16, x1
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_da:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autda x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_4
+; TRAP-NEXT: b.eq [[L]]auth_success_4
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_4:
; TRAP-NEXT: mov x0, x16
@@ -173,27 +188,27 @@ define i64 @test_auth_da(i64 %arg, i64 %arg1) {
define i64 @test_auth_da_zero(i64 %arg) {
; UNCHECKED-LABEL: test_auth_da_zero:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autdza x16
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_da_zero:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autdza x16
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_da_zero:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autdza x16
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_5
+; TRAP-NEXT: b.eq [[L]]auth_success_5
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_5:
; TRAP-NEXT: mov x0, x16
@@ -204,27 +219,27 @@ define i64 @test_auth_da_zero(i64 %arg) {
define i64 @test_auth_db(i64 %arg, i64 %arg1) {
; UNCHECKED-LABEL: test_auth_db:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autdb x16, x1
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_db:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autdb x16, x1
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_db:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autdb x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_6
+; TRAP-NEXT: b.eq [[L]]auth_success_6
; TRAP-NEXT: brk #0xc473
; TRAP-NEXT: Lauth_success_6:
; TRAP-NEXT: mov x0, x16
@@ -235,27 +250,27 @@ define i64 @test_auth_db(i64 %arg, i64 %arg1) {
define i64 @test_auth_db_zero(i64 %arg) {
; UNCHECKED-LABEL: test_auth_db_zero:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autdzb x16
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_db_zero:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autdzb x16
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_db_zero:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autdzb x16
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_7
+; TRAP-NEXT: b.eq [[L]]auth_success_7
; TRAP-NEXT: brk #0xc473
; TRAP-NEXT: Lauth_success_7:
; TRAP-NEXT: mov x0, x16
@@ -268,7 +283,7 @@ define i64 @test_auth_db_zero(i64 %arg) {
;; the validity of a signature.
define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_ia_ia:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autia x16, x1
; UNCHECKED-NEXT: pacia x16, x2
@@ -276,15 +291,15 @@ define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_ia_ia:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autia x16, x1
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpaci x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_0
+; CHECKED-NEXT: b.eq [[L]]auth_success_0
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_0
+; CHECKED-NEXT: b [[L]]resign_end_0
; CHECKED-NEXT: Lauth_success_0:
; CHECKED-NEXT: pacia x16, x2
; CHECKED-NEXT: Lresign_end_0:
@@ -292,13 +307,13 @@ define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_ia_ia:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autia x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_8
+; TRAP-NEXT: b.eq [[L]]auth_success_8
; TRAP-NEXT: brk #0xc470
; TRAP-NEXT: Lauth_success_8:
; TRAP-NEXT: pacia x16, x2
@@ -310,7 +325,7 @@ define i64 @test_resign_ia_ia(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_ib_ia:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autib x16, x1
; UNCHECKED-NEXT: pacia x16, x2
@@ -318,15 +333,15 @@ define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_ib_ia:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autib x16, x1
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpaci x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_1
+; CHECKED-NEXT: b.eq [[L]]auth_success_1
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_1
+; CHECKED-NEXT: b [[L]]resign_end_1
; CHECKED-NEXT: Lauth_success_1:
; CHECKED-NEXT: pacia x16, x2
; CHECKED-NEXT: Lresign_end_1:
@@ -334,13 +349,13 @@ define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_ib_ia:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autib x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_9
+; TRAP-NEXT: b.eq [[L]]auth_success_9
; TRAP-NEXT: brk #0xc471
; TRAP-NEXT: Lauth_success_9:
; TRAP-NEXT: pacia x16, x2
@@ -352,7 +367,7 @@ define i64 @test_resign_ib_ia(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_da_ia:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autda x16, x1
; UNCHECKED-NEXT: pacia x16, x2
@@ -360,15 +375,15 @@ define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_da_ia:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autda x16, x1
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpacd x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_2
+; CHECKED-NEXT: b.eq [[L]]auth_success_2
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_2
+; CHECKED-NEXT: b [[L]]resign_end_2
; CHECKED-NEXT: Lauth_success_2:
; CHECKED-NEXT: pacia x16, x2
; CHECKED-NEXT: Lresign_end_2:
@@ -376,13 +391,13 @@ define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_da_ia:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autda x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_10
+; TRAP-NEXT: b.eq [[L]]auth_success_10
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_10:
; TRAP-NEXT: pacia x16, x2
@@ -394,7 +409,7 @@ define i64 @test_resign_da_ia(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_db_da:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autdb x16, x1
; UNCHECKED-NEXT: pacda x16, x2
@@ -402,15 +417,15 @@ define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_db_da:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autdb x16, x1
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpacd x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_3
+; CHECKED-NEXT: b.eq [[L]]auth_success_3
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_3
+; CHECKED-NEXT: b [[L]]resign_end_3
; CHECKED-NEXT: Lauth_success_3:
; CHECKED-NEXT: pacda x16, x2
; CHECKED-NEXT: Lresign_end_3:
@@ -418,13 +433,13 @@ define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_db_da:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autdb x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_11
+; TRAP-NEXT: b.eq [[L]]auth_success_11
; TRAP-NEXT: brk #0xc473
; TRAP-NEXT: Lauth_success_11:
; TRAP-NEXT: pacda x16, x2
@@ -436,7 +451,7 @@ define i64 @test_resign_db_da(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_iza_db:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autiza x16
; UNCHECKED-NEXT: pacdb x16, x2
@@ -444,15 +459,15 @@ define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_iza_db:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autiza x16
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpaci x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_4
+; CHECKED-NEXT: b.eq [[L]]auth_success_4
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_4
+; CHECKED-NEXT: b [[L]]resign_end_4
; CHECKED-NEXT: Lauth_success_4:
; CHECKED-NEXT: pacdb x16, x2
; CHECKED-NEXT: Lresign_end_4:
@@ -460,13 +475,13 @@ define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_iza_db:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autiza x16
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_12
+; TRAP-NEXT: b.eq [[L]]auth_success_12
; TRAP-NEXT: brk #0xc470
; TRAP-NEXT: Lauth_success_12:
; TRAP-NEXT: pacdb x16, x2
@@ -478,7 +493,7 @@ define i64 @test_resign_iza_db(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-LABEL: test_resign_da_dzb:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autda x16, x1
; UNCHECKED-NEXT: pacdzb x16
@@ -486,15 +501,15 @@ define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_da_dzb:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autda x16, x1
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpacd x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_5
+; CHECKED-NEXT: b.eq [[L]]auth_success_5
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_5
+; CHECKED-NEXT: b [[L]]resign_end_5
; CHECKED-NEXT: Lauth_success_5:
; CHECKED-NEXT: pacdzb x16
; CHECKED-NEXT: Lresign_end_5:
@@ -502,13 +517,13 @@ define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_da_dzb:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autda x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_13
+; TRAP-NEXT: b.eq [[L]]auth_success_13
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_13:
; TRAP-NEXT: pacdzb x16
@@ -520,33 +535,33 @@ define i64 @test_resign_da_dzb(i64 %arg, i64 %arg1, i64 %arg2) {
define i64 @test_auth_trap_attribute(i64 %arg, i64 %arg1) "ptrauth-auth-traps" {
; UNCHECKED-LABEL: test_auth_trap_attribute:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autia x16, x1
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_trap_attribute:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autia x16, x1
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpaci x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_6
+; CHECKED-NEXT: b.eq [[L]]auth_success_6
; CHECKED-NEXT: brk #0xc470
; CHECKED-NEXT: Lauth_success_6:
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_trap_attribute:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autia x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_14
+; TRAP-NEXT: b.eq [[L]]auth_success_14
; TRAP-NEXT: brk #0xc470
; TRAP-NEXT: Lauth_success_14:
; TRAP-NEXT: mov x0, x16
@@ -557,30 +572,30 @@ define i64 @test_auth_trap_attribute(i64 %arg, i64 %arg1) "ptrauth-auth-traps" {
define i64 @test_auth_ia_constdisc(i64 %arg) {
; UNCHECKED-LABEL: test_auth_ia_constdisc:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
-; UNCHECKED-NEXT: mov x17, #256 ; =0x100
+; UNCHECKED-NEXT: mov x17, #256
; UNCHECKED-NEXT: autia x16, x17
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_auth_ia_constdisc:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
-; CHECKED-NEXT: mov x17, #256 ; =0x100
+; CHECKED-NEXT: mov x17, #256
; CHECKED-NEXT: autia x16, x17
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_auth_ia_constdisc:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
-; TRAP-NEXT: mov x17, #256 ; =0x100
+; TRAP-NEXT: mov x17, #256
; TRAP-NEXT: autia x16, x17
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpaci x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_15
+; TRAP-NEXT: b.eq [[L]]auth_success_15
; TRAP-NEXT: brk #0xc470
; TRAP-NEXT: Lauth_success_15:
; TRAP-NEXT: mov x0, x16
@@ -591,42 +606,42 @@ define i64 @test_auth_ia_constdisc(i64 %arg) {
define i64 @test_resign_da_constdisc(i64 %arg, i64 %arg1) {
; UNCHECKED-LABEL: test_resign_da_constdisc:
-; UNCHECKED: ; %bb.0:
+; UNCHECKED: %bb.0:
; UNCHECKED-NEXT: mov x16, x0
; UNCHECKED-NEXT: autda x16, x1
-; UNCHECKED-NEXT: mov x17, #256 ; =0x100
+; UNCHECKED-NEXT: mov x17, #256
; UNCHECKED-NEXT: pacda x16, x17
; UNCHECKED-NEXT: mov x0, x16
; UNCHECKED-NEXT: ret
;
; CHECKED-LABEL: test_resign_da_constdisc:
-; CHECKED: ; %bb.0:
+; CHECKED: %bb.0:
; CHECKED-NEXT: mov x16, x0
; CHECKED-NEXT: autda x16, x1
; CHECKED-NEXT: mov x17, x16
; CHECKED-NEXT: xpacd x17
; CHECKED-NEXT: cmp x16, x17
-; CHECKED-NEXT: b.eq Lauth_success_7
+; CHECKED-NEXT: b.eq [[L]]auth_success_7
; CHECKED-NEXT: mov x16, x17
-; CHECKED-NEXT: b Lresign_end_6
+; CHECKED-NEXT: b [[L]]resign_end_6
; CHECKED-NEXT: Lauth_success_7:
-; CHECKED-NEXT: mov x17, #256 ; =0x100
+; CHECKED-NEXT: mov x17, #256
; CHECKED-NEXT: pacda x16, x17
; CHECKED-NEXT: Lresign_end_6:
; CHECKED-NEXT: mov x0, x16
; CHECKED-NEXT: ret
;
; TRAP-LABEL: test_resign_da_constdisc:
-; TRAP: ; %bb.0:
+; TRAP: %bb.0:
; TRAP-NEXT: mov x16, x0
; TRAP-NEXT: autda x16, x1
; TRAP-NEXT: mov x17, x16
; TRAP-NEXT: xpacd x17
; TRAP-NEXT: cmp x16, x17
-; TRAP-NEXT: b.eq Lauth_success_16
+; TRAP-NEXT: b.eq [[L]]auth_success_16
; TRAP-NEXT: brk #0xc472
; TRAP-NEXT: Lauth_success_16:
-; TRAP-NEXT: mov x17, #256 ; =0x100
+; TRAP-NEXT: mov x17, #256
; TRAP-NEXT: pacda x16, x17
; TRAP-NEXT: mov x0, x16
; TRAP-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index c701e87..1270e69 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -501,8 +501,8 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -710,8 +710,8 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -936,7 +936,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1150,7 +1150,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 90110e6..e36161a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -501,8 +501,8 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -710,8 +710,8 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -936,7 +936,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1150,7 +1150,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index d4d5cb1..966a481 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -226,15 +226,16 @@ exit:
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
; GFX10-LABEL: single_lane_execution_attribute:
; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_mov_b32 s12, 0
-; GFX10-NEXT: s_mov_b32 s13, -1
-; GFX10-NEXT: s_mov_b32 s2, s0
-; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[12:13]
-; GFX10-NEXT: s_mov_b32 s3, s12
+; GFX10-NEXT: s_mov_b32 s7, -1
+; GFX10-NEXT: s_mov_b32 s2, s1
+; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT: s_mov_b32 s1, 0
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1]
+; GFX10-NEXT: s_mov_b32 s3, -1
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
@@ -248,8 +249,8 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: v_mov_b32_e32 v3, s12
-; GFX10-NEXT: v_mov_b32_e32 v4, s12
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v4, s1
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
@@ -261,17 +262,17 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: s_mov_b32 s13, 0
-; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s1
; GFX10-NEXT: .LBB4_4: ; %Flow
-; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s13
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
; GFX10-NEXT: s_cbranch_vccz .LBB4_6
; GFX10-NEXT: ; %bb.5: ; %.19
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: v_or_b32_e32 v3, 2, v1
; GFX10-NEXT: .LBB4_6: ; %.22
-; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2
+; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2
; GFX10-NEXT: buffer_store_dword v3, v0, s[8:11], 0 offen
; GFX10-NEXT: s_endpgm
.entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 5515de0..f52b7c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -193,12 +193,12 @@ bb12:
define amdgpu_kernel void @break_loop(i32 %arg) {
; CHECK-LABEL: break_loop:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0
-; CHECK-NEXT: s_mov_b64 s[0:1], 0
+; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0
+; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3
; CHECK-NEXT: ; implicit-def: $vgpr1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_subrev_u32_e32 v0, s2, v0
-; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3
+; CHECK-NEXT: v_subrev_u32_e32 v0, s0, v0
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: s_branch .LBB5_3
; CHECK-NEXT: .LBB5_1: ; %bb4
; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
index 34635b0..e19e782 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
@@ -8,9 +8,9 @@ define double @v_floor_f64_ieee(double %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -31,9 +31,9 @@ define double @v_floor_f64_ieee_nnan(double %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -51,9 +51,9 @@ define double @v_floor_f64_ieee_fneg(double %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -75,9 +75,9 @@ define double @v_floor_f64_nonieee(double %x) #1 {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -98,9 +98,9 @@ define double @v_floor_f64_nonieee_nnan(double %x) #1 {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -118,9 +118,9 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -142,9 +142,9 @@ define double @v_floor_f64_fabs(double %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]|
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -166,9 +166,9 @@ define double @v_floor_f64_fneg_fabs(double %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]|
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, -1
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
@@ -190,9 +190,9 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) {
; GFX6-LABEL: s_floor_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3]
-; GFX6-NEXT: s_mov_b32 s0, -1
-; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, -1
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
@@ -214,9 +214,9 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3]
-; GFX6-NEXT: s_mov_b32 s0, -1
-; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, -1
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
@@ -239,9 +239,9 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]|
-; GFX6-NEXT: s_mov_b32 s0, -1
-; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, -1
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
@@ -264,9 +264,9 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) {
; GFX6-LABEL: s_floor_f64_fneg_fabs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]|
-; GFX6-NEXT: s_mov_b32 s0, -1
-; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff
-; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, -1
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x3fefffff
+; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
index b54aec9..3c57832 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll
@@ -434,8 +434,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
; GFX90A_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
- ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
; GFX90A_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
; GFX90A_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -446,8 +446,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
- ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
- ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
@@ -615,8 +615,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READLANE_B32_]], %subreg.sub0, [[V_READLANE_B32_1]], %subreg.sub1
; GFX940_ITERATIVE-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI]], 0, [[COPY7]], 0, 0, implicit $mode, implicit $exec
- ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
- ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX940_ITERATIVE-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY8]], implicit $exec
; GFX940_ITERATIVE-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
; GFX940_ITERATIVE-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -627,8 +627,8 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY11]], [[V_NOT_B32_e32_]], implicit $exec
; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY12]], [[V_NOT_B32_e32_1]], implicit $exec
; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
- ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
- ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE2]], [[COPY13]], implicit $exec
; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
@@ -837,8 +837,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
; GFX90A_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
- ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX90A_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
; GFX90A_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec
; GFX90A_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
; GFX90A_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -849,8 +849,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec
; GFX90A_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec
; GFX90A_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
- ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
- ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX90A_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX90A_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
; GFX90A_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec
; GFX90A_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
; GFX90A_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
@@ -1089,8 +1089,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_WRITELANE_B32_]], %subreg.sub0, [[V_WRITELANE_B32_1]], %subreg.sub1
; GFX940_ITERATIVE-NEXT: [[COPY20:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX940_ITERATIVE-NEXT: [[V_ADD_F64_e64_1:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI2]], 0, [[COPY20]], 0, 0, implicit $mode, implicit $exec
- ; GFX940_ITERATIVE-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
- ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B1]]
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; GFX940_ITERATIVE-NEXT: [[COPY21:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_]]
; GFX940_ITERATIVE-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[V_MIN_U32_e64_]], [[COPY21]], implicit $exec
; GFX940_ITERATIVE-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
; GFX940_ITERATIVE-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
@@ -1101,8 +1101,8 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY24]], [[V_NOT_B32_e32_]], implicit $exec
; GFX940_ITERATIVE-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY25]], [[V_NOT_B32_e32_1]], implicit $exec
; GFX940_ITERATIVE-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
- ; GFX940_ITERATIVE-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
- ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B2]]
+ ; GFX940_ITERATIVE-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; GFX940_ITERATIVE-NEXT: [[COPY26:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B64_1]]
; GFX940_ITERATIVE-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U64_e64 [[REG_SEQUENCE5]], [[COPY26]], implicit $exec
; GFX940_ITERATIVE-NEXT: $vcc = COPY [[V_CMP_NE_U64_e64_]]
; GFX940_ITERATIVE-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit $vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
index 6cb1a12..9a1e1ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
---
name: fcmp_false_f16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir
index 7179b9f..cbf82da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
index 390541a..2368ea3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir
@@ -142,13 +142,9 @@ body: |
; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
- ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
- ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
;
; WAVE32-LABEL: name: constant_v_s64
; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -157,13 +153,9 @@ body: |
; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
- ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
- ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
%0:vgpr(s64) = G_CONSTANT i64 0
%1:vgpr(s64) = G_CONSTANT i64 1
%2:vgpr(s64) = G_CONSTANT i64 -1
@@ -184,34 +176,26 @@ tracksRegLiveness: true
body: |
bb.0:
; WAVE64-LABEL: name: constant_s_s64
- ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
- ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
- ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
- ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
- ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
- ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
- ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
- ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
- ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; WAVE64-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; WAVE64-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+ ; WAVE64-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+ ; WAVE64-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
+ ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+ ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[S_MOV_B]], implicit [[S_MOV_B64_3]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]]
;
; WAVE32-LABEL: name: constant_s_s64
- ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
- ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
- ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
- ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
- ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
- ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
- ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
- ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
- ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; WAVE32-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1
+ ; WAVE32-NEXT: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+ ; WAVE32-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+ ; WAVE32-NEXT: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27
+ ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+ ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[S_MOV_B]], implicit [[S_MOV_B64_3]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]]
%0:sgpr(s64) = G_CONSTANT i64 0
%1:sgpr(s64) = G_CONSTANT i64 1
%2:sgpr(s64) = G_CONSTANT i64 -1
@@ -311,6 +295,195 @@ body: |
...
---
+name: constant_s_p2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+
+ ; WAVE64-LABEL: name: constant_s_p2
+ ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+ ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ ;
+ ; WAVE32-LABEL: name: constant_s_p2
+ ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+ ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ %0:sgpr(p2) = G_CONSTANT i32 0
+ %1:sgpr(p2) = G_CONSTANT i32 1
+ %2:sgpr(p2) = G_CONSTANT i32 -1
+ %3:sgpr(p2) = G_CONSTANT i32 -54
+ %4:sgpr(p2) = G_CONSTANT i32 27
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name: constant_v_p2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; WAVE64-LABEL: name: constant_v_p2
+ ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+ ;
+ ; WAVE32-LABEL: name: constant_v_p2
+ ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+ %0:vgpr(p2) = G_CONSTANT i32 0
+ %1:vgpr(p2) = G_CONSTANT i32 1
+ %2:vgpr(p2) = G_CONSTANT i32 -1
+ %3:vgpr(p2) = G_CONSTANT i32 -54
+ %4:vgpr(p2) = G_CONSTANT i32 27
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name: constant_s_p5
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+
+ ; WAVE64-LABEL: name: constant_s_p5
+ ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+ ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ ;
+ ; WAVE32-LABEL: name: constant_s_p5
+ ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+ ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ %0:sgpr(p5) = G_CONSTANT i32 0
+ %1:sgpr(p5) = G_CONSTANT i32 1
+ %2:sgpr(p5) = G_CONSTANT i32 -1
+ %3:sgpr(p5) = G_CONSTANT i32 -54
+ %4:sgpr(p5) = G_CONSTANT i32 27
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name: constant_v_p5
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; WAVE64-LABEL: name: constant_v_p5
+ ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+ ;
+ ; WAVE32-LABEL: name: constant_v_p5
+ ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+ %0:vgpr(p5) = G_CONSTANT i32 0
+ %1:vgpr(p5) = G_CONSTANT i32 1
+ %2:vgpr(p5) = G_CONSTANT i32 -1
+ %3:vgpr(p5) = G_CONSTANT i32 -54
+ %4:vgpr(p5) = G_CONSTANT i32 27
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name: constant_s_p6
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+
+ ; WAVE64-LABEL: name: constant_s_p6
+ ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+ ; WAVE64-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ ;
+ ; WAVE32-LABEL: name: constant_s_p6
+ ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54
+ ; WAVE32-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]]
+ %0:sgpr(p6) = G_CONSTANT i32 0
+ %1:sgpr(p6) = G_CONSTANT i32 1
+ %2:sgpr(p6) = G_CONSTANT i32 -1
+ %3:sgpr(p6) = G_CONSTANT i32 -54
+ %4:sgpr(p6) = G_CONSTANT i32 27
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
+name: constant_v_p6
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; WAVE64-LABEL: name: constant_v_p6
+ ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+ ;
+ ; WAVE32-LABEL: name: constant_v_p6
+ ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]]
+ %0:vgpr(p6) = G_CONSTANT i32 0
+ %1:vgpr(p6) = G_CONSTANT i32 1
+ %2:vgpr(p6) = G_CONSTANT i32 -1
+ %3:vgpr(p6) = G_CONSTANT i32 -54
+ %4:vgpr(p6) = G_CONSTANT i32 27
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4
+...
+
+---
name: constant_s_p1
legalized: true
regBankSelected: true
@@ -325,13 +498,9 @@ body: |
; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
- ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
- ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
- ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE64-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE64-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
;
; WAVE32-LABEL: name: constant_s_p1
; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
@@ -340,13 +509,9 @@ body: |
; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
- ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
- ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
- ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE32-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE32-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
%0:sgpr(p1) = G_CONSTANT i64 0
%1:sgpr(p1) = G_CONSTANT i64 1
%2:sgpr(p1) = G_CONSTANT i64 -1
@@ -373,13 +538,9 @@ body: |
; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
- ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
- ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
;
; WAVE32-LABEL: name: constant_v_p1
; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -388,13 +549,9 @@ body: |
; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
- ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
- ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
%0:vgpr(p1) = G_CONSTANT i64 0
%1:vgpr(p1) = G_CONSTANT i64 1
%2:vgpr(p1) = G_CONSTANT i64 -1
@@ -407,98 +564,161 @@ body: |
...
---
-name: constant_s_p999
+name: constant_s_p0
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
- ; WAVE64-LABEL: name: constant_s_p999
+ ; WAVE64-LABEL: name: constant_s_p0
; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
- ; WAVE64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; WAVE64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; WAVE64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
- ; WAVE64-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
- ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE64-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE64-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
;
- ; WAVE32-LABEL: name: constant_s_p999
+ ; WAVE32-LABEL: name: constant_s_p0
; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
- ; WAVE32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; WAVE32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; WAVE32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 23255
- ; WAVE32-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -16
- ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
- %0:sgpr(p999) = G_CONSTANT i64 0
- %1:sgpr(p999) = G_CONSTANT i64 1
- %2:sgpr(p999) = G_CONSTANT i64 -1
- %3:sgpr(p999) = G_CONSTANT i64 -54
- %4:sgpr(p999) = G_CONSTANT i64 27
- %5:sgpr(p999) = G_CONSTANT i64 4294967295
- %6:sgpr(p999) = G_CONSTANT i64 4294967296
- %7:sgpr(p999) = G_CONSTANT i64 18446744004990098135
+ ; WAVE32-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE32-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
+ %0:sgpr(p0) = G_CONSTANT i64 0
+ %1:sgpr(p0) = G_CONSTANT i64 1
+ %2:sgpr(p0) = G_CONSTANT i64 -1
+ %3:sgpr(p0) = G_CONSTANT i64 -54
+ %4:sgpr(p0) = G_CONSTANT i64 27
+ %5:sgpr(p0) = G_CONSTANT i64 4294967295
+ %6:sgpr(p0) = G_CONSTANT i64 4294967296
+ %7:sgpr(p0) = G_CONSTANT i64 18446744004990098135
S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
...
---
-name: constant_v_p999
+name: constant_v_p0
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
- ; WAVE64-LABEL: name: constant_v_p999
+ ; WAVE64-LABEL: name: constant_v_p0
; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; WAVE64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
- ; WAVE64-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
- ; WAVE64-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
- ; WAVE64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
- ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
+ ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
;
- ; WAVE32-LABEL: name: constant_v_p999
+ ; WAVE32-LABEL: name: constant_v_p0
; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; WAVE32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
- ; WAVE32-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec
- ; WAVE32-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec
- ; WAVE32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1
- ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]]
- %0:vgpr(p999) = G_CONSTANT i64 0
- %1:vgpr(p999) = G_CONSTANT i64 1
- %2:vgpr(p999) = G_CONSTANT i64 -1
- %3:vgpr(p999) = G_CONSTANT i64 -54
- %4:vgpr(p999) = G_CONSTANT i64 27
- %5:vgpr(p999) = G_CONSTANT i64 4294967295
- %6:vgpr(p999) = G_CONSTANT i64 4294967296
- %7:vgpr(p999) = G_CONSTANT i64 18446744004990098135
+ ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
+ %0:vgpr(p0) = G_CONSTANT i64 0
+ %1:vgpr(p0) = G_CONSTANT i64 1
+ %2:vgpr(p0) = G_CONSTANT i64 -1
+ %3:vgpr(p0) = G_CONSTANT i64 -54
+ %4:vgpr(p0) = G_CONSTANT i64 27
+ %5:vgpr(p0) = G_CONSTANT i64 4294967295
+ %6:vgpr(p0) = G_CONSTANT i64 4294967296
+ %7:vgpr(p0) = G_CONSTANT i64 18446744004990098135
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
+...
+---
+name: constant_s_p4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; WAVE64-LABEL: name: constant_s_p4
+ ; WAVE64: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; WAVE64-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; WAVE64-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+ ; WAVE64-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+ ; WAVE64-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+ ; WAVE64-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+ ; WAVE64-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE64-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
+ ;
+ ; WAVE32-LABEL: name: constant_s_p4
+ ; WAVE32: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; WAVE32-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1
+ ; WAVE32-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+ ; WAVE32-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -54
+ ; WAVE32-NEXT: [[S_MOV_B4:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 27
+ ; WAVE32-NEXT: [[S_MOV_B5:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967295
+ ; WAVE32-NEXT: [[S_MOV_B6:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
+ ; WAVE32-NEXT: [[S_MOV_B7:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -68719453481
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]], implicit [[S_MOV_B4]], implicit [[S_MOV_B5]], implicit [[S_MOV_B6]], implicit [[S_MOV_B7]]
+ %0:sgpr(p4) = G_CONSTANT i64 0
+ %1:sgpr(p4) = G_CONSTANT i64 1
+ %2:sgpr(p4) = G_CONSTANT i64 -1
+ %3:sgpr(p4) = G_CONSTANT i64 -54
+ %4:sgpr(p4) = G_CONSTANT i64 27
+ %5:sgpr(p4) = G_CONSTANT i64 4294967295
+ %6:sgpr(p4) = G_CONSTANT i64 4294967296
+ %7:sgpr(p4) = G_CONSTANT i64 18446744004990098135
+ S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
+...
+
+---
+name: constant_v_p4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; WAVE64-LABEL: name: constant_v_p4
+ ; WAVE64: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE64-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE64-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
+ ;
+ ; WAVE32-LABEL: name: constant_v_p4
+ ; WAVE32: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -54, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 27, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967295, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4294967296, implicit $exec
+ ; WAVE32-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -68719453481, implicit $exec
+ ; WAVE32-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]], implicit [[V_MOV_B1]], implicit [[V_MOV_B2]], implicit [[V_MOV_B3]], implicit [[V_MOV_B4]], implicit [[V_MOV_B5]], implicit [[V_MOV_B6]], implicit [[V_MOV_B7]]
+ %0:vgpr(p4) = G_CONSTANT i64 0
+ %1:vgpr(p4) = G_CONSTANT i64 1
+ %2:vgpr(p4) = G_CONSTANT i64 -1
+ %3:vgpr(p4) = G_CONSTANT i64 -54
+ %4:vgpr(p4) = G_CONSTANT i64 27
+ %5:vgpr(p4) = G_CONSTANT i64 4294967295
+ %6:vgpr(p4) = G_CONSTANT i64 4294967296
+ %7:vgpr(p4) = G_CONSTANT i64 18446744004990098135
S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
index 6047bda..3428230 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
index 942a0a3..13e29f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir
@@ -87,13 +87,13 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: fconstant_s_s64
- ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4607182418800017408
- ; GCN-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4620693217682128896
- ; GCN-NEXT: [[S_MOV_B2:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4611686018427387904
- ; GCN-NEXT: [[S_MOV_B3:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4601552919265804288
- ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B]]
- ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[S_MOV_B1]]
- ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B]], implicit [[S_MOV_B1]], implicit [[S_MOV_B2]], implicit [[S_MOV_B3]]
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 4607182418800017408
+ ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4620693217682128896
+ ; GCN-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -4611686018427387904
+ ; GCN-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4601552919265804288
+ ; GCN-NEXT: $sgpr0_sgpr1 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: $sgpr2_sgpr3 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B1]]
%0:sgpr(s64) = G_FCONSTANT double 1.0
%1:sgpr(s64) = G_FCONSTANT double 8.0
%2:sgpr(s64) = G_FCONSTANT double -2.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index 02c6220..ada80da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -1374,11 +1374,11 @@ body: |
; GFX6: liveins: $sgpr0_sgpr1
; GFX6-NEXT: {{ $}}
; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX6-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+ ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
; GFX6-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+ ; GFX6-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
; GFX6-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX6-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1389,11 +1389,11 @@ body: |
; GFX7: liveins: $sgpr0_sgpr1
; GFX7-NEXT: {{ $}}
; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX7-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+ ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
; GFX7-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+ ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
; GFX7-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+ ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
; GFX7-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX7-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1404,11 +1404,11 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+ ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+ ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
; GFX8-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX8-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1419,11 +1419,11 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+ ; GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
@@ -1434,11 +1434,11 @@ body: |
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
+ ; GFX11-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub0
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B64_]].sub1
; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
index 85ac533..cf4e6c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir
@@ -741,17 +741,15 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -759,17 +757,15 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -777,17 +773,15 @@ body: |
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -795,17 +789,15 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
%0:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -831,35 +823,31 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
- ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
+ ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[REG_SEQUENCE]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
;
; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294971390
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
- ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -867,17 +855,15 @@ body: |
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -885,17 +871,15 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4094
- ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294971390
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
%0:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -921,17 +905,15 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -939,17 +921,15 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -957,17 +937,15 @@ body: |
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -975,17 +953,15 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967295
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
%0:sgpr(p1) = COPY $sgpr0_sgpr1
@@ -1010,17 +986,15 @@ body: |
; GFX9: liveins: $sgpr0_sgpr1
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX9-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX9-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX9-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -1028,17 +1002,15 @@ body: |
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -1046,17 +1018,15 @@ body: |
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX11-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
;
@@ -1064,17 +1034,15 @@ body: |
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GFX12-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
; GFX12-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
; GFX12-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec :: (load (s32), addrspace 1)
; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
%0:sgpr(p1) = COPY $sgpr0_sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index 4309f48..6f97178 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -45,9 +45,7 @@ regBankSelected: true
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0
# Max immediate for CI
-# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
-# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 3
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
+# SIVI: [[K:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 17179869180
# SIVI-DAG: [[K_SUB0:%[0-9]+]]:sreg_32 = COPY [[K]].sub0
# SIVI-DAG: [[PTR_LO:%[0-9]+]]:sreg_32 = COPY [[PTR]].sub0
# SIVI-DAG: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
@@ -59,9 +57,7 @@ regBankSelected: true
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0
# Immediate overflow for CI
-# GCN: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-# GCN: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 4
-# GCN: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
+# GCN: [[K:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 17179869184
# GCN-DAG: [[K_SUB0:%[0-9]+]]:sreg_32 = COPY [[K]].sub0
# GCN-DAG: [[PTR_LO:%[0-9]+]]:sreg_32 = COPY [[PTR]].sub0
# GCN-DAG: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
@@ -77,9 +73,7 @@ regBankSelected: true
# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0
# Overflow 32-bit byte offset
-# SIVI: [[K_LO:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-# SIVI: [[K_HI:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-# SIVI: [[K:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[K_LO]], %subreg.sub0, [[K_HI]], %subreg.sub1
+# SIVI: [[K:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
# SIVI-DAG: [[K_SUB0:%[0-9]+]]:sreg_32 = COPY [[K]].sub0
# SIVI-DAG: [[PTR_LO:%[0-9]+]]:sreg_32 = COPY [[PTR]].sub0
# SIVI-DAG: [[ADD_PTR_LO:%[0-9]+]]:sreg_32 = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
index 5ade8ae..b9b0245 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir
index de29037..41e416e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptr-add.mir
@@ -2,10 +2,10 @@
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE64 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE32 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE64 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE32 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE32 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE64 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10-WAVE32 %s
---
name: gep_p0_sgpr_sgpr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
index 2cfbb68..2a3d97d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrmask.mir
@@ -314,8 +314,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
- ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+ ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B64_]], implicit-def dead $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
%0:sgpr(p0) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_CONSTANT i64 0
@@ -337,10 +337,8 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4042322160
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -252645136
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1085102592571150096
+ ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], [[S_MOV_B]], implicit-def dead $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
%0:sgpr(p0) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_CONSTANT i64 -1085102592571150096
@@ -362,9 +360,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
- ; CHECK-NEXT: %const:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], %const, implicit-def dead $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
%0:sgpr(p0) = COPY $sgpr0_sgpr1
@@ -387,9 +383,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; CHECK-NEXT: %const:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -4294967296
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -416,9 +410,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
- ; CHECK-NEXT: %const:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO 4294967296
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], %const, implicit-def dead $scc
; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
%0:sgpr(p0) = COPY $sgpr0_sgpr1
@@ -441,7 +433,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -2
+ ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -468,7 +460,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -4
+ ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -495,7 +487,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -8
+ ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -522,7 +514,7 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64_IMM_PSEUDO -16
+ ; CHECK-NEXT: %const:sreg_64 = S_MOV_B64 -16
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %const.sub0
@@ -743,17 +735,15 @@ body: |
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4042322160, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -252645136, implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+ ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -1085102592571150096, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[COPY3]], implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1
; CHECK-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], [[COPY4]], implicit $exec
- ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -1085102592571150096
%2:vgpr(p0) = G_PTRMASK %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index b90fe2f..60357ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -2,7 +2,7 @@
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s
# RUN: llc -O0 -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
index 229a84e..f2daa23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr="+wavefrontsize32" -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
index 9be5e14..57bbe02 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
---
name: legal_brcond_vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 4603668..96cab20 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare i32 @llvm.ctpop.i32(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 2a26082..6415e18 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-LABEL: test_wave32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
index ec069c1..81c73c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-LABEL: test_wave32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index d1fa579..614f59c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -906,28 +906,26 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7
; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_movk_i32 s17, 0x102
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v9, s16 :: v_dual_lshlrev_b32 v2, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34
; GFX11-NEXT: s_mov_b32 s8, 0x40400000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v9, 0xb36211c7 :: v_dual_lshlrev_b32 v2, 2, v0
; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
; GFX11-NEXT: s_mov_b32 s9, 4.0
; GFX11-NEXT: s_mov_b32 s14, 0x41000000
; GFX11-NEXT: s_mov_b32 s13, 0x40e00000
-; GFX11-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v3, s8
; GFX11-NEXT: v_mov_b32_e32 v6, s12
-; GFX11-NEXT: v_mov_b32_e32 v4, s9
-; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v5, s10
+; GFX11-NEXT: v_bfrev_b32_e32 v10, 4.0
+; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8
+; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: s_mov_b32 s5, 1.0
-; GFX11-NEXT: v_mov_b32_e32 v7, s13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
@@ -1012,17 +1010,16 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6
; GFX11-NEXT: s_mov_b32 s6, 2.0
-; GFX11-NEXT: s_movk_i32 s13, 0x102
; GFX11-NEXT: s_mov_b32 s8, 0x42004600
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_mov_b32 s9, 0x44004700
; GFX11-NEXT: s_mov_b32 s10, 0x45004800
-; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v3, s8
-; GFX11-NEXT: v_mov_b32_e32 v7, s13
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x34
-; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10
+; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6
+; GFX11-NEXT: v_bfrev_b32_e32 v7, 4.0
+; GFX11-NEXT: v_mov_b32_e32 v3, s8
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: s_mov_b32 s4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 4d01279..7916403 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -326,7 +326,6 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
@@ -334,6 +333,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GCN-NEXT: v_accvgpr_write_b32 a5, s5
; GCN-NEXT: v_accvgpr_write_b32 a6, s6
; GCN-NEXT: v_accvgpr_write_b32 a7, s7
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index 3def367..9ef54ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -77,11 +77,12 @@ define double @v_rsq_clamp_f64(double %src) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b32 s4, -1
-; VI-NEXT: s_mov_b32 s5, 0x7fefffff
-; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT: s_mov_b32 s5, 0xffefffff
-; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f64:
@@ -92,13 +93,14 @@ define double @v_rsq_clamp_f64(double %src) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, -1
-; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
@@ -115,11 +117,12 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
-; VI-NEXT: s_mov_b32 s4, -1
-; VI-NEXT: s_mov_b32 s5, 0x7fefffff
-; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT: s_mov_b32 s5, 0xffefffff
-; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_fabs_f64:
@@ -130,13 +133,14 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
-; GFX12-NEXT: s_mov_b32 s0, -1
-; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call double @llvm.fabs.f64(double %src)
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
@@ -185,11 +189,12 @@ define double @v_rsq_clamp_undef_f64() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: s_mov_b32 s4, -1
-; VI-NEXT: s_mov_b32 s5, 0x7fefffff
-; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT: s_mov_b32 s5, 0xffefffff
-; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_undef_f64:
@@ -200,13 +205,14 @@ define double @v_rsq_clamp_undef_f64() #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
-; GFX12-NEXT: s_mov_b32 s0, -1
-; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
ret double %rsq_clamp
@@ -254,11 +260,12 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: s_mov_b32 s4, -1
-; VI-NEXT: s_mov_b32 s5, 0x7fefffff
-; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
-; VI-NEXT: s_mov_b32 s5, 0xffefffff
-; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, -1
+; VI-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f64_non_ieee:
@@ -269,13 +276,14 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, -1
-; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0x7fefffff
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: v_mov_b32_e32 v2, -1
+; GFX12-NEXT: v_mov_b32_e32 v3, 0xffefffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 5074f88..8f88aae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -117,10 +117,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v3, 0x4010cccc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index 4e44ef3..cc0e34b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index a72b7e4..112a7d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index 2d9fc9b..0f60f40 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
; SI-LABEL: static_exact:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 3edd2e0..7cd3bab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -7,12 +7,11 @@ declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
; LOOP-LABEL: memset_p1i8:
; LOOP: ; %bb.0: ; %loadstoreloop.preheader
-; LOOP-NEXT: s_mov_b64 s[4:5], 0
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
-; LOOP-NEXT: s_mov_b64 s[0:1], 0
-; LOOP-NEXT: v_mov_b32_e32 v3, s4
-; LOOP-NEXT: v_mov_b32_e32 v4, s5
+; LOOP-NEXT: v_mov_b32_e32 v4, s1
+; LOOP-NEXT: v_mov_b32_e32 v3, s0
; LOOP-NEXT: .LBB0_1: ; %loadstoreloop
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 48217b9..4c34209 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -76,29 +76,25 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967296:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, 0
-; GFX6-NEXT: s_mov_b32 s5, 4
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, 0
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4294967296:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s4, 0
-; GFX7-NEXT: s_mov_b32 s5, 4
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: v_mov_b32_e32 v1, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
@@ -120,29 +116,25 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in
define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) inreg %ptr) {
; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4294967297:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, 4
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, 4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4294967297:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s4, 4
-; GFX7-NEXT: s_mov_b32 s5, s4
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, 4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
+; GFX7-NEXT: v_mov_b32_e32 v1, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
@@ -234,9 +226,9 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s0, 0
; GFX6-NEXT: s_mov_b32 s1, 4
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s0
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -244,21 +236,16 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s0, 0
; GFX7-NEXT: s_mov_b32 s1, 4
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s0
; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX12-LABEL: mubuf_store_vgpr_ptr_offset4294967296:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_mov_b32 s1, 4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
@@ -273,7 +260,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s0, 4
-; GFX6-NEXT: s_mov_b32 s1, s0
+; GFX6-NEXT: s_mov_b32 s1, 4
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -283,7 +270,7 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s0, 4
-; GFX7-NEXT: s_mov_b32 s1, s0
+; GFX7-NEXT: s_mov_b32 s1, 4
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
@@ -292,13 +279,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
;
; GFX12-LABEL: mubuf_store_vgpr_ptr_offset4294967297:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_mov_b32 s0, 4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 4
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
@@ -715,28 +697,24 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967296:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, 0
-; GFX6-NEXT: s_mov_b32 s5, 4
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, 0
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967296:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s4, 0
-; GFX7-NEXT: s_mov_b32 s5, 4
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_mov_b32_e32 v1, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
@@ -758,28 +736,24 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) in
define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(ptr addrspace(1) inreg %ptr) {
; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4294967297:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, 4
-; GFX6-NEXT: s_mov_b32 s5, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, 4
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967297:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s4, 4
-; GFX7-NEXT: s_mov_b32 s5, s4
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, 4
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_mov_b32_e32 v1, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
@@ -868,8 +842,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s0, 0
; GFX6-NEXT: s_mov_b32 s1, 4
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
@@ -878,21 +852,16 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s0, 0
; GFX7-NEXT: s_mov_b32 s1, 4
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s0
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4294967296:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_mov_b32 s1, 4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
@@ -905,7 +874,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s0, 4
-; GFX6-NEXT: s_mov_b32 s1, s0
+; GFX6-NEXT: s_mov_b32 s1, 4
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -915,7 +884,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s0, 4
-; GFX7-NEXT: s_mov_b32 s1, s0
+; GFX7-NEXT: s_mov_b32 s1, 4
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
@@ -924,13 +893,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
;
; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4294967297:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_mov_b32 s0, 4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s1, s0
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 4
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
@@ -1307,15 +1271,13 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) {
; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, 0
-; GFX6-NEXT: s_mov_b32 s5, 4
-; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: v_mov_b32_e32 v0, 2
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_mov_b32_e32 v2, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s4
-; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
@@ -1324,15 +1286,13 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
;
; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s4, 0
-; GFX7-NEXT: s_mov_b32 s5, 4
-; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: v_mov_b32_e32 v0, 2
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s4
-; GFX7-NEXT: v_mov_b32_e32 v2, s5
; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -1404,8 +1364,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
; GFX6-NEXT: s_mov_b32 s0, 0
; GFX6-NEXT: s_mov_b32 s1, 4
; GFX6-NEXT: v_mov_b32_e32 v2, 2
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s0
; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
@@ -1418,8 +1378,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
; GFX7-NEXT: s_mov_b32 s0, 0
; GFX7-NEXT: s_mov_b32 s1, 4
; GFX7-NEXT: v_mov_b32_e32 v2, 2
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s0
; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -1428,13 +1388,8 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
;
; GFX12-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_mov_b32 s1, 4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1549,15 +1504,13 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr, i32 %old, i32 %in) {
; GFX6-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, 0
-; GFX6-NEXT: s_mov_b32 s5, 4
-; GFX6-NEXT: v_mov_b32_e32 v3, s4
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_mov_b32_e32 v4, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s4
-; GFX6-NEXT: v_mov_b32_e32 v4, s5
; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
@@ -1567,15 +1520,13 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
;
; GFX7-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s4, 0
-; GFX7-NEXT: s_mov_b32 s5, 4
-; GFX7-NEXT: v_mov_b32_e32 v3, s4
+; GFX7-NEXT: v_mov_b32_e32 v3, 0
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_mov_b32_e32 v4, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s4
-; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -1649,8 +1600,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s0, 0
; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_mov_b32 s1, 4
+; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, s0
; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
@@ -1663,8 +1614,8 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s0, 0
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_mov_b32 s1, 4
+; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: s_mov_b32 s2, s0
; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
@@ -1673,13 +1624,9 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
;
; GFX12-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_mov_b32 s1, 4
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v5, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 377fa24..404e726 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -218,6 +218,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s11
+; CHECK-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9]
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
@@ -326,10 +327,9 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
-; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0
-; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
+; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 83ebc84..5b94e71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -193,7 +193,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_mov_b32 s7, -1
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
-; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: s_mov_b32 s7, 1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -212,6 +212,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s9
+; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
@@ -272,43 +273,43 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1
-; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0
+; CHECK-NEXT: v_mul_hi_u32 v5, s10, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1
+; CHECK-NEXT: v_mul_hi_u32 v6, s11, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, s11, v1
+; CHECK-NEXT: v_mul_lo_u32 v5, s11, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, s10, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v0, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s11
+; CHECK-NEXT: v_mov_b32_e32 v3, s11
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s9
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; CHECK-NEXT: v_mov_b32_e32 v4, s9
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v3, v1, vcc
; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v5, s[0:1]
+; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
+; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s8, v3
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
@@ -321,12 +322,11 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
-; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s7, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index d155513..e31d8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -191,7 +191,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_mov_b32 s7, -1
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
-; CHECK-NEXT: s_mov_b32 s4, 1
+; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
@@ -199,6 +199,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v2
; CHECK-NEXT: s_subb_u32 s5, 0, s3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -317,12 +318,11 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
-; CHECK-NEXT: s_xor_b32 s1, s4, 1
+; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index cc0f7e2..f30b278b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -188,13 +188,14 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_mov_b32 s7, -1
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
-; CHECK-NEXT: s_mov_b32 s4, 1
+; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
+; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v2
; CHECK-NEXT: s_subb_u32 s5, 0, s3
@@ -313,12 +314,11 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
-; CHECK-NEXT: s_xor_b32 s1, s4, 1
+; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 6c232b6..08a5d66 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
index 717a4fc..738671c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
index 1ef50cb..25f4145 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
index 0bd255e..87fe8334 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
index 7399fa0..a03180f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 6eb7a4a..ebbab5c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -634,11 +634,11 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
; CHECK-NEXT: v_or_b32_e32 v1, 1, v1
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: v_log_f16_e64 v2, |v0|
-; CHECK-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: v_mul_f16_e32 v1, v2, v1
; CHECK-NEXT: v_exp_f16_e32 v1, v1
-; CHECK-NEXT: v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -669,9 +669,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-NEXT: v_exp_f32_e32 v1, v1
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1f800000
; CHECK-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; CHECK-NEXT: s_brev_b32 s4, 1
+; CHECK-NEXT: s_brev_b32 s4, -2
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2
-; CHECK-NEXT: v_and_or_b32 v0, v0, s4, v1
+; CHECK-NEXT: v_bfi_b32 v0, s4, v1, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%call = tail call fast float @_Z4pownfi(float %x, i32 %y)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 16ffdd7..d732da1 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -4,8 +4,8 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefix=GFX11
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store:
@@ -2131,14 +2131,26 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
; GFX10-NEXT: global_store_short v[2:3], v5, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_store_fpimm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
-; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
-; GFX11-NEXT: global_store_b16 v[0:1], v4, off
-; GFX11-NEXT: global_store_b16 v[2:3], v5, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: test_store_fpimm:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v5, off
+; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v4, off
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: test_store_fpimm:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off
+; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
store bfloat 1.0, ptr addrspace(1) %ptr0
store bfloat 42.0, ptr addrspace(1) %ptr1
ret void
@@ -9310,6 +9322,72 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fadd_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fadd_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_add_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fadd <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
@@ -13012,6 +13090,72 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fsub_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fsub_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fsub <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
@@ -13615,6 +13759,72 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fmul_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fmul_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_mul_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fmul <3 x bfloat> %a, %b
ret <3 x bfloat> %op
}
@@ -17609,6 +17819,72 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_minnum_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_minnum_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_min_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
ret <3 x bfloat> %op
}
@@ -21162,6 +21438,72 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_maxnum_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_max_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
ret <3 x bfloat> %op
}
@@ -24674,6 +25016,41 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_frexp_bf16_i16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_frexp_mant_f32_e32 v1, v0
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_frexp_bf16_i16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
ret { bfloat, i16 } %op
}
@@ -30041,6 +30418,72 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX11TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX11FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i16> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -30501,6 +30944,60 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i32> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -31325,6 +31822,144 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_xor_b32_e32 v8, v0, v1
+; GFX11TRUE16-NEXT: v_xor_b32_e32 v7, v4, v5
+; GFX11TRUE16-NEXT: v_xor_b32_e32 v9, v2, v3
+; GFX11TRUE16-NEXT: v_cls_i32_e32 v10, v1
+; GFX11TRUE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX11TRUE16-NEXT: v_cls_i32_e32 v11, v3
+; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
+; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
+; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
+; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
+; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX11TRUE16-NEXT: v_min_u32_e32 v8, v10, v8
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX11TRUE16-NEXT: v_min_u32_e32 v7, v11, v9
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v6
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_xor_b32_e32 v8, v0, v1
+; GFX11FAKE16-NEXT: v_xor_b32_e32 v7, v4, v5
+; GFX11FAKE16-NEXT: v_xor_b32_e32 v9, v2, v3
+; GFX11FAKE16-NEXT: v_cls_i32_e32 v10, v1
+; GFX11FAKE16-NEXT: v_cls_i32_e32 v6, v5
+; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX11FAKE16-NEXT: v_cls_i32_e32 v11, v3
+; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v9, 31, v9
+; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, -1, v10
+; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v6, -1, v6
+; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v7, 32, v7
+; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v11, -1, v11
+; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, 32, v9
+; GFX11FAKE16-NEXT: v_min_u32_e32 v8, v10, v8
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX11FAKE16-NEXT: v_min_u32_e32 v7, v11, v9
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
+; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v8
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i64> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -32056,6 +32691,74 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i16> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -32518,6 +33221,60 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i32> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -33204,6 +33961,120 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v6, v1
+; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v7, v5
+; GFX11TRUE16-NEXT: v_clz_i32_u32_e32 v8, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX11TRUE16-NEXT: v_min_u32_e32 v7, 32, v7
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX11TRUE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX11TRUE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX11TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
+; GFX11TRUE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v7
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v6, v1
+; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v7, v5
+; GFX11FAKE16-NEXT: v_clz_i32_u32_e32 v8, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX11FAKE16-NEXT: v_min_u32_e32 v7, 32, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX11FAKE16-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GFX11FAKE16-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v7, 32, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_min_u32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX11FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v3, 32, v6
+; GFX11FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v8
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX11FAKE16-NEXT: v_ldexp_f32 v1, v1, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i64> %x to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -39040,6 +39911,84 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fma_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX11TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v3, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fma_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
}
@@ -39908,6 +40857,132 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX11TRUE16-NEXT: v_mul_f32_e32 v3, v7, v6
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v7, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
}
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
index b0b9bbe..dfc2853 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
@@ -28,3 +28,20 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f
store i32 %maybe.not.uniform.load, ptr addrspace(1) undef
ret void
}
+
+; This decomposes into a sequence of divergent sub-carries. The first
+; subs in the sequence are divergent because of their value inputs, but
+; the last values are divergent due to the carry-in glue (so divergence
+; needs to propagate through glue if there are any non-void outputs).
+; GCN-LABEL: {{^}}wide_carry_divergence_error:
+; GCN: v_sub_u32_e32
+; GCN: v_subb_u32_e32
+; GCN: v_subbrev_u32_e32
+; GCN: v_subbrev_u32_e32
+define <2 x i128> @wide_carry_divergence_error(i128 %arg) {
+ %i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false)
+ %i1 = sub i128 0, %i
+ %i2 = insertelement <2 x i128> zeroinitializer, i128 %i1, i64 0
+ ret <2 x i128> %i2
+}
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
new file mode 100644
index 0000000..78fb25a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -0,0 +1,1072 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX8 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX900 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX90A %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX1010 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX1100 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX1200 %s
+
+---
+name: s_copy_frame_index_elimination_failure_pei
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: '', type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+ - { id: 1, name: '', type: default, offset: 8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: 8, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+ - { id: 2, name: '', type: default, offset: 24, size: 8, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: 24, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+ ; GFX8-LABEL: name: s_copy_frame_index_elimination_failure_pei
+ ; GFX8: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+ ; GFX8-NEXT: renamable $sgpr17 = S_MOV_B32 0
+ ; GFX8-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+ ; GFX8-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+ ; GFX8-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+ ; GFX8-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GFX8-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+ ; GFX8-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+ ; GFX8-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+ ; GFX8-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GFX8-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+ ; GFX8-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+ ; GFX8-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+ ; GFX8-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+ ; GFX8-NEXT: SI_RETURN
+ ;
+ ; GFX900-LABEL: name: s_copy_frame_index_elimination_failure_pei
+ ; GFX900: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+ ; GFX900-NEXT: renamable $sgpr17 = S_MOV_B32 0
+ ; GFX900-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+ ; GFX900-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+ ; GFX900-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+ ; GFX900-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GFX900-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+ ; GFX900-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+ ; GFX900-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+ ; GFX900-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GFX900-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+ ; GFX900-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+ ; GFX900-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+ ; GFX900-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+ ; GFX900-NEXT: SI_RETURN
+ ;
+ ; GFX90A-LABEL: name: s_copy_frame_index_elimination_failure_pei
+ ; GFX90A: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+ ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0
+ ; GFX90A-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+ ; GFX90A-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+ ; GFX90A-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+ ; GFX90A-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GFX90A-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+ ; GFX90A-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+ ; GFX90A-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+ ; GFX90A-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GFX90A-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+ ; GFX90A-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+ ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+ ; GFX90A-NEXT: SI_RETURN
+ ;
+ ; GFX1010-LABEL: name: s_copy_frame_index_elimination_failure_pei
+ ; GFX1010: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+ ; GFX1010-NEXT: {{ $}}
+ ; GFX1010-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+ ; GFX1010-NEXT: renamable $sgpr17 = S_MOV_B32 0
+ ; GFX1010-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo
+ ; GFX1010-NEXT: $sgpr24 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc
+ ; GFX1010-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+ ; GFX1010-NEXT: $sgpr20 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc
+ ; GFX1010-NEXT: $sgpr20 = S_ADD_I32 killed $sgpr20, 4, implicit-def $scc
+ ; GFX1010-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+ ; GFX1010-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+ ; GFX1010-NEXT: $sgpr31 = S_LSHR_B32 $sgpr32, 5, implicit-def $scc
+ ; GFX1010-NEXT: $sgpr31 = S_ADD_I32 killed $sgpr31, 8, implicit-def $scc
+ ; GFX1010-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+ ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+ ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+ ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+ ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+ ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+ ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+ ; GFX1010-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+ ; GFX1010-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+ ; GFX1010-NEXT: SI_RETURN
+ ;
+ ; GFX1100-LABEL: name: s_copy_frame_index_elimination_failure_pei
+ ; GFX1100: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+ ; GFX1100-NEXT: renamable $sgpr17 = S_MOV_B32 0
+ ; GFX1100-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo
+ ; GFX1100-NEXT: renamable $sgpr24 = S_MOV_B32 $sgpr32
+ ; GFX1100-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+ ; GFX1100-NEXT: $sgpr22 = S_ADD_I32 $sgpr32, 4, implicit-def $scc
+ ; GFX1100-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22
+ ; GFX1100-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+ ; GFX1100-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+ ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
+ ; GFX1100-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32
+ ; GFX1100-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc
+ ; GFX1100-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+ ; GFX1100-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+ ; GFX1100-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+ ; GFX1100-NEXT: SI_RETURN
+ ;
+ ; GFX1200-LABEL: name: s_copy_frame_index_elimination_failure_pei
+ ; GFX1200: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $vgpr4, $vgpr31, $vgpr40, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+ ; GFX1200-NEXT: renamable $sgpr17 = S_MOV_B32 0
+ ; GFX1200-NEXT: undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc_lo
+ ; GFX1200-NEXT: renamable $sgpr24 = S_MOV_B32 $sgpr32
+ ; GFX1200-NEXT: renamable $sgpr29 = COPY undef renamable $sgpr30
+ ; GFX1200-NEXT: $sgpr22 = S_ADD_I32 $sgpr32, 4, implicit-def $scc
+ ; GFX1200-NEXT: renamable $sgpr20 = S_MOV_B32 killed $sgpr22
+ ; GFX1200-NEXT: undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+ ; GFX1200-NEXT: undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+ ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, 8, implicit-def $scc
+ ; GFX1200-NEXT: renamable $sgpr31 = S_MOV_B32 $sgpr32
+ ; GFX1200-NEXT: $sgpr32 = S_ADD_I32 $sgpr32, -8, implicit-def $scc
+ ; GFX1200-NEXT: renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+ ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+ ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+ ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+ ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+ ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+ ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+ ; GFX1200-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+ ; GFX1200-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+ ; GFX1200-NEXT: SI_RETURN
+ renamable $sgpr18_sgpr19 = S_MOV_B64 $src_private_base
+ renamable $sgpr17 = S_MOV_B32 0
+ undef renamable $vcc_lo = COPY undef renamable $sgpr17, implicit-def $vcc
+ renamable $sgpr24 = S_MOV_B32 %stack.0
+ renamable $sgpr29 = COPY undef renamable $sgpr30
+ renamable $sgpr20 = S_MOV_B32 %stack.1
+ undef renamable $sgpr22 = COPY killed undef renamable $sgpr22, implicit-def $sgpr22_sgpr23
+ undef renamable $sgpr26 = COPY killed undef renamable $sgpr26, implicit-def $sgpr26_sgpr27
+ renamable $sgpr31 = S_MOV_B32 %stack.2
+ renamable $vgpr3 = COPY killed renamable $sgpr30, implicit $exec
+ renamable $vgpr0_vgpr1 = COPY renamable $sgpr28_sgpr29, implicit $exec
+ renamable $vgpr0_vgpr1 = COPY killed renamable $vcc, implicit $exec
+ renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr26_sgpr27, implicit $exec
+ renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr24_sgpr25, implicit $exec
+ renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr22_sgpr23, implicit $exec
+ renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr20_sgpr21, implicit $exec
+ renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr18_sgpr19, implicit $exec
+ dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu_gfx90ainsts, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0, implicit-def $vgpr1
+ SI_RETURN
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5, $vgpr0
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+ ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = S_MOV_B32 64
+ ; GFX8-NEXT: $vgpr0, dead $sgpr0_sgpr1 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+ ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+ ; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+ ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1010-NEXT: {{ $}}
+ ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+ ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+ ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1100-NEXT: $sgpr0 = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+ ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr0, 0, implicit-def $scc
+ ; GFX1100-NEXT: $sgpr0 = S_BITSET0_B32 0, $sgpr0
+ ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr0
+ ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc
+ ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1200-NEXT: $sgpr0 = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+ ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr0, 0, implicit-def $scc
+ ; GFX1200-NEXT: $sgpr0 = S_BITSET0_B32 0, $sgpr0
+ ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr0
+ ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+
+...
+
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5, $vgpr0
+
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+ ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+ ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128
+ ; GFX8-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr1, 0, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; GFX8-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX8-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX8-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX8-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX8-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX8-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX8-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX8-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX8-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX8-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX8-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX8-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX8-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX8-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX8-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX8-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX8-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+ ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+ ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 128, killed $vgpr1, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; GFX900-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX900-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX900-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX900-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX900-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX900-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX900-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX900-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX900-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX900-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX900-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX900-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX900-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX900-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX900-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX900-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX900-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+ ; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+ ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+ ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+ ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+ ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+ ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+ ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+ ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+ ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+ ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+ ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+ ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+ ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+ ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+ ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+ ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 64, killed $vgpr1, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX90A-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec
+ ; GFX90A-NEXT: $vgpr62 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec
+ ; GFX90A-NEXT: $vgpr61 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec
+ ; GFX90A-NEXT: $vgpr60 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec
+ ; GFX90A-NEXT: $vgpr59 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec
+ ; GFX90A-NEXT: $vgpr58 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec
+ ; GFX90A-NEXT: $vgpr57 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec
+ ; GFX90A-NEXT: $vgpr56 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec
+ ; GFX90A-NEXT: $vgpr47 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec
+ ; GFX90A-NEXT: $vgpr46 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec
+ ; GFX90A-NEXT: $vgpr45 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec
+ ; GFX90A-NEXT: $vgpr44 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+ ; GFX90A-NEXT: $vgpr43 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90A-NEXT: $vgpr42 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX90A-NEXT: $vgpr41 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX90A-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+ ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX1010-NEXT: {{ $}}
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX1010-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1010-NEXT: $vgpr64 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+ ; GFX1010-NEXT: $vgpr64 = V_ADD_U32_e32 128, killed $vgpr64, implicit $exec
+ ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr64, implicit $exec
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX1010-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX1010-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX1010-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX1010-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX1010-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX1010-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX1010-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX1010-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX1010-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX1010-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX1010-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX1010-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX1010-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX1010-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX1010-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX1010-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX1010-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+ ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr43, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr45, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr46, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr47, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr56, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr57, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr58, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr59, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr60, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr61, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr62, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX1100-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr63, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1100-NEXT: $sgpr5 = S_ADDC_U32 $sgpr32, 128, implicit-def $scc, implicit $scc
+ ; GFX1100-NEXT: S_BITCMP1_B32 $sgpr5, 0, implicit-def $scc
+ ; GFX1100-NEXT: $sgpr5 = S_BITSET0_B32 0, $sgpr5
+ ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr5
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX1100-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX1100-NEXT: $vgpr63 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX1100-NEXT: $vgpr62 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX1100-NEXT: $vgpr61 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX1100-NEXT: $vgpr60 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX1100-NEXT: $vgpr59 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX1100-NEXT: $vgpr58 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX1100-NEXT: $vgpr57 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX1100-NEXT: $vgpr56 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX1100-NEXT: $vgpr47 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX1100-NEXT: $vgpr46 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX1100-NEXT: $vgpr45 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX1100-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX1100-NEXT: $vgpr43 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX1100-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX1100-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX1100-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_vgpr
+ ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr40, $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr43, $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr45, $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr46, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr47, $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr56, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr57, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr58, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr59, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr60, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr61, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr62, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr63, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1200-NEXT: $sgpr5 = S_ADDC_U32 $sgpr32, 128, implicit-def $scc, implicit $scc
+ ; GFX1200-NEXT: S_BITCMP1_B32 $sgpr5, 0, implicit-def $scc
+ ; GFX1200-NEXT: $sgpr5 = S_BITSET0_B32 0, $sgpr5
+ ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $sgpr5
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX1200-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX1200-NEXT: $vgpr63 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX1200-NEXT: $vgpr62 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX1200-NEXT: $vgpr61 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX1200-NEXT: $vgpr60 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX1200-NEXT: $vgpr59 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX1200-NEXT: $vgpr58 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX1200-NEXT: $vgpr57 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX1200-NEXT: $vgpr56 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX1200-NEXT: $vgpr47 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX1200-NEXT: $vgpr46 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX1200-NEXT: $vgpr45 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX1200-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX1200-NEXT: $vgpr43 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX1200-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX1200-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX1200-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+
+ V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+
+ S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+...
+
+
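+# Descriptive note (inferred from the test body): this variant blocks
+# $sgpr0-$sgpr63 instead of the VGPRs; in the GFX1100/GFX1200 checks the
+# scavenged temporary is $vcc_hi.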
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5, $vgpr0
+
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1010-NEXT: {{ $}}
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+ ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+ ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+ ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+ ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+ ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+ ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+ ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+
+ V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+
+ S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5, $vgpr0
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+ ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 68, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+ ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+ ; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+ ;
+ ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+ ; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1010-NEXT: {{ $}}
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
+ ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+ ; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1010-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+ ; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
+ ; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+ ; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+ ; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1100-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ ;
+ ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc_no_sgpr
+ ; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
+ ; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 68, implicit-def $scc, implicit $scc
+ ; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
+ ; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
+ ; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ ; GFX1200-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ ; GFX1200-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
+ S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ S_NOP 0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ S_NOP 0, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ S_NOP 0, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ S_NOP 0, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ S_NOP 0, implicit-def $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ S_NOP 0, implicit-def $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+
+ V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+
+ S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+ S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ S_NOP 0, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23
+ S_NOP 0, implicit $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+ S_NOP 0, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39
+ S_NOP 0, implicit $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47
+ S_NOP 0, implicit $sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55
+ S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index eeddc22..4215ae4 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -310,8 +310,8 @@ ret:
; GFX11-LABEL: tied_operand_test:
; GFX11: ; %bb.0: ; %entry
-; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
-; GFX11-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
+; GFX11: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
+; GFX11: v_dual_mov_b32 [[C:v[0-9]+]], 0x7b :: v_dual_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[LDRESULT]] offset:10
; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
index d8736c5..34c7614 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -1,5 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX8,GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX900,GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=GFX90A,GCN %s
---
name: func_add_constant_to_fi_divergent_i32
@@ -211,3 +213,602 @@ body: |
renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
...
+---
+name: materialize_fi_s_mov_b32_offset_0_dead_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_0_dead_scc
+ ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+ renamable $sgpr4 = S_MOV_B32 %stack.0
+ S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_0_live_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc
+ ; GCN: liveins: $sgpr4, $sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GCN-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GCN-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.0
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FI#0 is filler to get a non-0 offset for FI#1
+---
+name: materialize_fi_s_mov_b32_offset_64_dead_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_64_dead_scc
+ ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GCN-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 64, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_dead_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_68_dead_scc
+ ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GCN-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc
+ ; GFX8: liveins: $sgpr4, $sgpr5
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = S_MOV_B32 64
+ ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc
+ ; GFX900: liveins: $sgpr4, $sgpr5
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc
+ ; GFX90A: liveins: $sgpr4, $sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_live_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc
+ ; GFX8: liveins: $sgpr4, $sgpr5
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = S_MOV_B32 68
+ ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc
+ ; GFX900: liveins: $sgpr4, $sgpr5
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc
+ ; GFX90A: liveins: $sgpr4, $sgpr5
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FIXME: This is finding a VGPR
+---
+name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 4, alignment: 16, stack-id: default }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+ ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = S_MOV_B32 64
+ ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX8-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX8-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX8-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX8-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX8-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX8-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX8-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX8-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX8-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX8-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX8-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX8-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX8-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX8-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX8-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX8-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+ ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX900-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX900-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX900-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX900-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX900-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX900-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX900-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX900-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX900-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX900-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX900-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX900-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX900-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX900-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX900-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX900-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+ ; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+ ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+ ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+ ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+ ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+ ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+ ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+ ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+ ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+ ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+ ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+ ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+ ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+ ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+ ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX90A-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec
+ ; GFX90A-NEXT: $vgpr62 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec
+ ; GFX90A-NEXT: $vgpr61 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec
+ ; GFX90A-NEXT: $vgpr60 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec
+ ; GFX90A-NEXT: $vgpr59 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec
+ ; GFX90A-NEXT: $vgpr58 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec
+ ; GFX90A-NEXT: $vgpr57 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec
+ ; GFX90A-NEXT: $vgpr56 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec
+ ; GFX90A-NEXT: $vgpr47 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec
+ ; GFX90A-NEXT: $vgpr46 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec
+ ; GFX90A-NEXT: $vgpr45 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec
+ ; GFX90A-NEXT: $vgpr44 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+ ; GFX90A-NEXT: $vgpr43 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90A-NEXT: $vgpr42 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX90A-NEXT: $vgpr41 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX90A-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+
+ renamable $sgpr4 = S_MOV_B32 %stack.0
+
+ S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FIXME: This is clobbering scc
+---
+name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+ ; GFX8: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = S_MOV_B32 128
+ ; GFX8-NEXT: $vgpr0, dead $vcc = V_ADD_CO_U32_e64 killed $sgpr4, killed $vgpr0, 0, implicit $exec
+ ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX8-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX8-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX8-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX8-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX8-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX8-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX8-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX8-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX8-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX8-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX8-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX8-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX8-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX8-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX8-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX8-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX8-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+ ; GFX900: liveins: $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+ ; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 128, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 132, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX900-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GFX900-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GFX900-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GFX900-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GFX900-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GFX900-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GFX900-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GFX900-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GFX900-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GFX900-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GFX900-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GFX900-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GFX900-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GFX900-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GFX900-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GFX900-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ ;
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+ ; GFX90A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $sgpr4, $sgpr5, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr40, implicit $exec
+ ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr41, implicit $exec
+ ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr42, implicit $exec
+ ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr43, implicit $exec
+ ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr44, implicit $exec
+ ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr45, implicit $exec
+ ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr46, implicit $exec
+ ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr47, implicit $exec
+ ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr56, implicit $exec
+ ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr57, implicit $exec
+ ; GFX90A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr58, implicit $exec
+ ; GFX90A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr59, implicit $exec
+ ; GFX90A-NEXT: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr60, implicit $exec
+ ; GFX90A-NEXT: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr61, implicit $exec
+ ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr62, implicit $exec
+ ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.18, addrspace 5)
+ ; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.18, addrspace 5)
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GFX90A-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GFX90A-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec
+ ; GFX90A-NEXT: $vgpr62 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec
+ ; GFX90A-NEXT: $vgpr61 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec
+ ; GFX90A-NEXT: $vgpr60 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec
+ ; GFX90A-NEXT: $vgpr59 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec
+ ; GFX90A-NEXT: $vgpr58 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec
+ ; GFX90A-NEXT: $vgpr57 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec
+ ; GFX90A-NEXT: $vgpr56 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec
+ ; GFX90A-NEXT: $vgpr47 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec
+ ; GFX90A-NEXT: $vgpr46 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec
+ ; GFX90A-NEXT: $vgpr45 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec
+ ; GFX90A-NEXT: $vgpr44 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec
+ ; GFX90A-NEXT: $vgpr43 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90A-NEXT: $vgpr42 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX90A-NEXT: $vgpr41 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX90A-NEXT: $vgpr40 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+
+ S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index 555af50..881433f 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -314,3 +314,52 @@ exit:
tail call void asm sideeffect "; use $0", "v"(i64 %v1)
ret void
}
+
+; CHECK-LABEL: {{^}}scc_as_i32:
+; CHECK: ; def scc
+; CHECK: ; use scc
+define void @scc_as_i32() {
+ %scc = call i32 asm sideeffect "; def $0", "={scc}"()
+ call void asm sideeffect "; use $0 ", "{scc}"(i32 %scc)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}scc_as_i1:
+; CHECK: ; def scc
+; CHECK: ; use scc
+define void @scc_as_i1() {
+ %scc = call i1 asm sideeffect "; def $0", "={scc}"()
+ call void asm sideeffect "; use $0 ", "{scc}"(i1 %scc)
+ ret void
+}
+
+; Make sure the SGPR def is treated as a uniform value when the inline
+; assembly also defines a divergent value. The add should be scalar
+; and not introduce illegal VGPR-to-SGPR copies.
+; CHECK-LABEL: {{^}}mixed_def_vgpr_sgpr_def_asm:
+; CHECK: ; def v0 s[4:5]
+; CHECK: s_add_u32
+; CHECK-NEXT: s_addc_u32
+; CHECK: ; use s[4:5]
+define void @mixed_def_vgpr_sgpr_def_asm() {
+ %vgpr_sgpr = call { i32, i64 } asm sideeffect "; def $0 $1 ", "=v,={s[4:5]}"()
+ %vgpr = extractvalue { i32, i64 } %vgpr_sgpr, 0
+ %sgpr = extractvalue { i32, i64 } %vgpr_sgpr, 1
+ %sgpr.add = add i64 %sgpr, 2
+ call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}mixed_def_sgpr_vgpr_def_asm:
+; CHECK: ; def s[4:5] v0
+; CHECK: s_add_u32
+; CHECK-NEXT: s_addc_u32
+; CHECK: ; use s[4:5]
+define void @mixed_def_sgpr_vgpr_def_asm() {
+ %sgpr_vgpr = call { i64, i32 } asm sideeffect "; def $0 $1 ", "={s[4:5]},=v"()
+ %sgpr = extractvalue { i64, i32 } %sgpr_vgpr, 0
+ %vgpr = extractvalue { i64, i32 } %sgpr_vgpr, 1
+ %sgpr.add = add i64 %sgpr, 2
+ call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index b62bf89..247ec40 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -974,19 +974,19 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s3, 3
-; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_lshrrev_b16_e64 v2, 1, s2
-; GCN-NEXT: v_lshrrev_b16_e64 v3, 2, s2
-; GCN-NEXT: v_lshrrev_b16_e64 v4, 3, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_lshrrev_b16_e64 v4, 2, s2
+; GCN-NEXT: v_lshrrev_b16_e64 v5, 3, s2
+; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: v_or_b32_e32 v0, s3, v0
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
-; GCN-NEXT: v_and_b32_e32 v3, 3, v3
-; GCN-NEXT: v_and_b32_e32 v4, 1, v4
-; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0
-; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:3
-; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0 offset:2
+; GCN-NEXT: v_and_b32_e32 v4, 3, v4
+; GCN-NEXT: v_and_b32_e32 v5, 1, v5
+; GCN-NEXT: buffer_store_byte v3, off, s[12:15], 0
+; GCN-NEXT: buffer_store_byte v5, off, s[12:15], 0 offset:3
+; GCN-NEXT: buffer_store_byte v4, off, s[12:15], 0 offset:2
; GCN-NEXT: buffer_store_byte v2, off, s[12:15], 0 offset:1
-; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen
; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0
; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 266ab68..d51ace6 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -94,14 +94,14 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
; CHECK-LABEL: module_0_kernel_normal_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b32 v2, v1
; CHECK-NEXT: s_endpgm
store i16 2, ptr addrspace(3) @kernel_normal
@@ -134,14 +134,14 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_lshl_b32 s4, s15, 2
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_add_i32 s4, s4, 4
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
-; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:2
+; CHECK-NEXT: ds_write_b32 v3, v1
; CHECK-NEXT: s_endpgm
call void @use_module()
store i16 1, ptr addrspace(3) @module_variable
@@ -157,14 +157,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-LABEL: module_0_kernel_overalign_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b32 v2, v1
; CHECK-NEXT: s_endpgm
store i16 2, ptr addrspace(3) @kernel_overalign
@@ -197,14 +197,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_lshl_b32 s4, s15, 2
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_add_i32 s4, s4, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
-; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:4
+; CHECK-NEXT: ds_write_b32 v3, v1
; CHECK-NEXT: s_endpgm
call void @use_module()
store i16 1, ptr addrspace(3) @module_variable
@@ -220,14 +220,14 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_0_kernel_normal_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b32 v2, v1
; CHECK-NEXT: s_endpgm
store i16 2, ptr addrspace(3) @kernel_normal
@@ -260,14 +260,14 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_lshl_b32 s4, s15, 2
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_add_i32 s4, s4, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
-; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:2
+; CHECK-NEXT: ds_write_b32 v3, v1
; CHECK-NEXT: s_endpgm
call void @use_module()
store i16 1, ptr addrspace(3) @module_variable
@@ -283,14 +283,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-LABEL: module_0_kernel_overalign_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b32 v2, v1
; CHECK-NEXT: s_endpgm
store i16 2, ptr addrspace(3) @kernel_overalign
@@ -323,14 +323,14 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19]
; CHECK-NEXT: s_lshl_b32 s4, s15, 2
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_add_i32 s4, s4, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
-; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:4
+; CHECK-NEXT: ds_write_b32 v3, v1
; CHECK-NEXT: s_endpgm
call void @use_module()
store i16 1, ptr addrspace(3) @module_variable
@@ -368,11 +368,11 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, 2
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b32 s15, 0
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT: ds_write_b16 v3, v4
+; CHECK-NEXT: ds_write_b16 v4, v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
@@ -408,12 +408,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 2
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
@@ -445,11 +445,11 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, 2
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b32 s15, 2
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT: ds_write_b16 v3, v4
+; CHECK-NEXT: ds_write_b16 v4, v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
@@ -485,12 +485,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 2
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:4
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
@@ -522,11 +522,11 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, 2
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b32 s15, 1
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT: ds_write_b16 v3, v4
+; CHECK-NEXT: ds_write_b16 v4, v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
@@ -562,12 +562,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 2
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
@@ -599,11 +599,11 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT: v_mov_b32_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v4, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, 2
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_mov_b32 s15, 3
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT: ds_write_b16 v3, v4
+; CHECK-NEXT: ds_write_b16 v4, v3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
@@ -639,12 +639,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 2
-; CHECK-NEXT: ds_write_b16 v0, v1
-; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT: ds_write_b16 v1, v0
+; CHECK-NEXT: ds_write_b16 v1, v2 offset:4
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 7998d430d..0fb9e25 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -19,18 +19,18 @@ $_f2 = comdat any
define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce) local_unnamed_addr #0 {
; GCN-LABEL: test:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 2
-; GCN-NEXT: ds_write_b8 v0, v1
-; GCN-NEXT: ds_read_u8 v2, v0 offset:2
-; GCN-NEXT: ds_read_u16 v3, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 2
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: ds_write_b8 v1, v0
+; GCN-NEXT: ds_read_u8 v2, v1 offset:2
+; GCN-NEXT: ds_read_u16 v3, v1
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ds_write_b8 v0, v2 offset:6
-; GCN-NEXT: ds_write_b16 v0, v3 offset:4
-; GCN-NEXT: v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; GCN-NEXT: global_store_byte v0, v1, s[0:1]
+; GCN-NEXT: ds_write_b8 v1, v2 offset:6
+; GCN-NEXT: ds_write_b16 v1, v3 offset:4
+; GCN-NEXT: v_cmp_eq_u16_sdwa s[2:3], v3, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GCN-NEXT: global_store_byte v1, v0, s[0:1]
; GCN-NEXT: s_endpgm
; CHECK-LABEL: define protected amdgpu_kernel void @test(
; CHECK-SAME: ptr addrspace(1) nocapture [[PTR_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
@@ -47,7 +47,6 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce
; CHECK-NEXT: [[FROMBOOL8:%.*]] = zext i1 [[TMP2]] to i8
; CHECK-NEXT: store i8 [[FROMBOOL8]], ptr addrspace(1) [[PTR_COERCE]], align 1
; CHECK-NEXT: ret void
-;
entry:
store i8 3, ptr addrspace(3) @_f1, align 1
tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef align 1 dereferenceable(3) @_f2, ptr addrspace(3) noundef align 1 dereferenceable(3) @_f1, i64 3, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
new file mode 100644
index 0000000..94d1eca
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -0,0 +1,1836 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10_1 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10_3 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+
+; We aren't pressuring the SGPRs, so this can use the add-with-carry-out form on pre-gfx9 targets.
+define void @scalar_mov_materializes_frame_index_unavailable_scc() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v0
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v0
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: v_writelane_b32 v1, s59, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: s_addc_u32 s0, s32, 0x4040
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_bitset0_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s59, s0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v1, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: v_writelane_b32 v1, s59, 0
+; GFX12-NEXT: s_add_co_ci_u32 s0, s32, 0x4000
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_bitset0_b32 s0, 0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v0
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_mov_b32 s59, s0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_readlane_b32 s59, v1, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT: v_writelane_b32 v1, s59, 0
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use alloca0 v0
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: s_movk_i32 s59, 0x4040
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v1, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use alloca0 v0
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT: v_writelane_b32 v1, s59, 0
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v1, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT: scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_add_i32 s0, s32, 64
+; GFX940-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NEXT: s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT: s_addc_u32 s0, s32, 0x4040
+; GFX940-NEXT: s_bitcmp1_b32 s0, 0
+; GFX940-NEXT: s_bitset0_b32 s0, 0
+; GFX940-NEXT: v_writelane_b32 v1, s59, 0
+; GFX940-NEXT: s_mov_b32 s59, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use alloca0 v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v1, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca i32, align 4, addrspace(5)
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1, i32 0)
+ ret void
+}
+
+; %alloca1 should end up materializing with s_mov_b32, and scc is
+; available.
+define void @scalar_mov_materializes_frame_index_dead_scc() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_1-NEXT: s_addk_i32 s59, 0x4040
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v0
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_3-NEXT: s_addk_i32 s59, 0x4040
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v0
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v1, s59, 0
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_mov_b32 s59, s0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v1, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: v_writelane_b32 v1, s59, 0
+; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_mov_b32 s59, s0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v0
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_readlane_b32 s59, v1, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v1, s59, 0
+; GFX8-NEXT: s_lshr_b32 s59, s32, 6
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: s_addk_i32 s59, 0x4040
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use alloca0 v0
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v1, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v1, s59, 0
+; GFX900-NEXT: s_lshr_b32 s59, s32, 6
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: s_addk_i32 s59, 0x4040
+; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use alloca0 v0
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v1, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_dead_scc:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT: scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_add_i32 s0, s32, 64
+; GFX940-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NEXT: s_add_i32 s0, s32, 0x4040
+; GFX940-NEXT: v_writelane_b32 v1, s59, 0
+; GFX940-NEXT: s_mov_b32 s59, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use alloca0 v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v1, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca i32, align 4, addrspace(5)
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+ call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca1)
+ ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_mov_b32 s5, s33
+; GFX10_1-NEXT: s_mov_b32 s33, s32
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880
+; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v0
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80880
+; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000
+; GFX10_1-NEXT: s_mov_b32 s33, s5
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_mov_b32 s5, s33
+; GFX10_3-NEXT: s_mov_b32 s33, s32
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880
+; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v0
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s33
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880
+; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000
+; GFX10_3-NEXT: s_mov_b32 s33, s5
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s1, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s2, s33, 0x4044
+; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_addk_i32 s32, 0x4080
+; GFX11-NEXT: s_add_i32 s0, s33, 64
+; GFX11-NEXT: v_writelane_b32 v1, s59, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: s_addc_u32 s0, s33, 0x4040
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_bitset0_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s59, s0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v1, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s2, s33, 0x4044
+; GFX11-NEXT: scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_addk_i32 s32, 0xbf80
+; GFX11-NEXT: s_mov_b32 s33, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s1, s33
+; GFX12-NEXT: s_mov_b32 s33, s32
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v1, s33 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: v_writelane_b32 v1, s59, 0
+; GFX12-NEXT: s_add_co_ci_u32 s0, s33, 0x4000
+; GFX12-NEXT: v_mov_b32_e32 v0, s33
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_bitset0_b32 s0, 0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v0
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_mov_b32 s59, s0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_readlane_b32 s59, v1, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
+; GFX12-NEXT: s_mov_b32 s33, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s7, s33, 0x101100
+; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT: v_writelane_b32 v1, s59, 0
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use alloca0 v0
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s33
+; GFX8-NEXT: s_movk_i32 s59, 0x4040
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s59, v0
+; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v1, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s7, s33, 0x101100
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s6, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s7, s33, 0x101100
+; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33
+; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use alloca0 v0
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s33
+; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX900-NEXT: v_writelane_b32 v1, s59, 0
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v1, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s7, s33, 0x101100
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000
+; GFX900-NEXT: s_mov_b32 s33, s6
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_fp:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b32 s2, s33
+; GFX940-NEXT: s_mov_b32 s33, s32
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s3, s33, 0x4044
+; GFX940-NEXT: scratch_store_dword off, v1, s3 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_addk_i32 s32, 0x4080
+; GFX940-NEXT: s_add_i32 s0, s33, 64
+; GFX940-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NEXT: s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT: s_addc_u32 s0, s33, 0x4040
+; GFX940-NEXT: s_bitcmp1_b32 s0, 0
+; GFX940-NEXT: s_bitset0_b32 s0, 0
+; GFX940-NEXT: v_writelane_b32 v1, s59, 0
+; GFX940-NEXT: s_mov_b32 s59, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use alloca0 v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v1, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s3, s33, 0x4044
+; GFX940-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_addk_i32 s32, 0xbf80
+; GFX940-NEXT: s_mov_b32 s33, s2
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca i32, align 4, addrspace(5)
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1, i32 0)
+ ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: v_writelane_b32 v0, s59, 0
+; GFX11-NEXT: s_addc_u32 s0, s32, 64
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_bitset0_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s59, s0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v0, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: v_writelane_b32 v0, s59, 0
+; GFX12-NEXT: s_mov_b32 s59, s32
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readlane_b32 s59, v0, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v0, s59, 0
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT: s_mov_b32 s59, 64
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s59, v1
+; GFX8-NEXT: v_readfirstlane_b32 s59, v1
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v0, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT: v_writelane_b32 v0, s59, 0
+; GFX900-NEXT: v_readfirstlane_b32 s59, v1
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v0, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT: scratch_store_dword off, v0, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT: s_addc_u32 s0, s32, 64
+; GFX940-NEXT: s_bitcmp1_b32 s0, 0
+; GFX940-NEXT: s_bitset0_b32 s0, 0
+; GFX940-NEXT: v_writelane_b32 v0, s59, 0
+; GFX940-NEXT: s_mov_b32 s59, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v0, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT: scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
+ ret void
+}
+
+define void @scalar_mov_materializes_frame_index_available_scc_small_offset() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_1-NEXT: s_add_i32 s59, s59, 64
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5
+; GFX10_3-NEXT: s_add_i32 s59, s59, 64
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80800
+; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT: scratch_store_b32 off, v0, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v0, s59, 0
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_mov_b32 s59, s0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v0, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT: scratch_load_b32 v0, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v0, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: v_writelane_b32 v0, s59, 0
+; GFX12-NEXT: s_mov_b32 s59, s32
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readlane_b32 s59, v0, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v0, s59, 0
+; GFX8-NEXT: s_lshr_b32 s59, s32, 6
+; GFX8-NEXT: s_add_i32 s59, s59, 64
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v0, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v0, s59, 0
+; GFX900-NEXT: s_lshr_b32 s59, s32, 6
+; GFX900-NEXT: s_add_i32 s59, s59, 64
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v0, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101000
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT: scratch_store_dword off, v0, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_add_i32 s0, s32, 64
+; GFX940-NEXT: v_writelane_b32 v0, s59, 0
+; GFX940-NEXT: s_mov_b32 s59, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v0, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4040
+; GFX940-NEXT: scratch_load_dword v0, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
+ ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp() #1 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_mov_b32 s5, s33
+; GFX10_1-NEXT: s_mov_b32 s33, s32
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s33
+; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000
+; GFX10_1-NEXT: s_mov_b32 s33, s5
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_mov_b32 s5, s33
+; GFX10_3-NEXT: s_mov_b32 s33, s32
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s33
+; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000
+; GFX10_3-NEXT: s_mov_b32 s33, s5
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s1, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_addk_i32 s32, 0x4080
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: v_writelane_b32 v0, s59, 0
+; GFX11-NEXT: s_addc_u32 s0, s33, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_bitset0_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s59, s0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v0, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_addk_i32 s32, 0xbf80
+; GFX11-NEXT: s_mov_b32 s33, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s1, s33
+; GFX12-NEXT: s_mov_b32 s33, s32
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: v_writelane_b32 v0, s59, 0
+; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT: s_mov_b32 s59, s33
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_readlane_b32 s59, v0, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
+; GFX12-NEXT: s_mov_b32 s33, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s7, s33, 0x101000
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v0, s59, 0
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s33
+; GFX8-NEXT: s_mov_b32 s59, 64
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s59, v1
+; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX8-NEXT: v_readfirstlane_b32 s59, v1
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v0, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s7, s33, 0x101000
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000
+; GFX8-NEXT: s_mov_b32 s33, s6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s6, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s7, s33, 0x101000
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s33
+; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX900-NEXT: v_writelane_b32 v0, s59, 0
+; GFX900-NEXT: v_readfirstlane_b32 s59, v1
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v0, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s7, s33, 0x101000
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000
+; GFX900-NEXT: s_mov_b32 s33, s6
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b32 s2, s33
+; GFX940-NEXT: s_mov_b32 s33, s32
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s3, s33, 0x4040
+; GFX940-NEXT: scratch_store_dword off, v0, s3 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_addk_i32 s32, 0x4080
+; GFX940-NEXT: s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT: s_addc_u32 s0, s33, 64
+; GFX940-NEXT: s_bitcmp1_b32 s0, 0
+; GFX940-NEXT: s_bitset0_b32 s0, 0
+; GFX940-NEXT: v_writelane_b32 v0, s59, 0
+; GFX940-NEXT: s_mov_b32 s59, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v0, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s3, s33, 0x4040
+; GFX940-NEXT: scratch_load_dword v0, off, s3 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_addk_i32 s32, 0xbf80
+; GFX940-NEXT: s_mov_b32 s33, s2
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca0, i32 0)
+ ret void
+}
+
+define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() #1 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_mov_b32 s4, s33
+; GFX10_1-NEXT: s_mov_b32 s33, s32
+; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1
+; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s5
+; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_1-NEXT: s_lshr_b32 s59, s33, 5
+; GFX10_1-NEXT: s_add_i32 s59, s59, 64
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s5, -1
+; GFX10_1-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s5
+; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000
+; GFX10_1-NEXT: s_mov_b32 s33, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_mov_b32 s4, s33
+; GFX10_3-NEXT: s_mov_b32 s33, s32
+; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1
+; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s5
+; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000
+; GFX10_3-NEXT: s_lshr_b32 s59, s33, 5
+; GFX10_3-NEXT: s_add_i32 s59, s59, 64
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v0, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s5, -1
+; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800
+; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s5
+; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000
+; GFX10_3-NEXT: s_mov_b32 s33, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_xor_saveexec_b32 s1, -1
+; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: v_writelane_b32 v0, s59, 0
+; GFX11-NEXT: s_addk_i32 s32, 0x4080
+; GFX11-NEXT: s_add_i32 s1, s33, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 s59, s1
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v0, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s1, -1
+; GFX11-NEXT: s_add_i32 s2, s33, 0x4040
+; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_addk_i32 s32, 0xbf80
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, s33
+; GFX12-NEXT: s_mov_b32 s33, s32
+; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
+; GFX12-NEXT: scratch_store_b32 off, v0, s33 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_writelane_b32 v0, s59, 0
+; GFX12-NEXT: s_mov_b32 s59, s33
+; GFX12-NEXT: s_addk_co_i32 s32, 0x4040
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readlane_b32 s59, v0, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
+; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0
+; GFX12-NEXT: s_mov_b32 s33, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, s33
+; GFX8-NEXT: s_mov_b32 s33, s32
+; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GFX8-NEXT: s_add_i32 s5, s33, 0x101000
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX8-NEXT: v_writelane_b32 v0, s59, 0
+; GFX8-NEXT: s_lshr_b32 s59, s33, 6
+; GFX8-NEXT: s_add_i32 s59, s59, 64
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v0, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GFX8-NEXT: s_add_i32 s5, s33, 0x101000
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000
+; GFX8-NEXT: s_mov_b32 s33, s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_mov_b32 s4, s33
+; GFX900-NEXT: s_mov_b32 s33, s32
+; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GFX900-NEXT: s_add_i32 s5, s33, 0x101000
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[6:7]
+; GFX900-NEXT: s_add_i32 s32, s32, 0x102000
+; GFX900-NEXT: v_writelane_b32 v0, s59, 0
+; GFX900-NEXT: s_lshr_b32 s59, s33, 6
+; GFX900-NEXT: s_add_i32 s59, s59, 64
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v0, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GFX900-NEXT: s_add_i32 s5, s33, 0x101000
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[6:7]
+; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000
+; GFX900-NEXT: s_mov_b32 s33, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_available_scc_small_offset_fp:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b32 s0, s33
+; GFX940-NEXT: s_mov_b32 s33, s32
+; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1
+; GFX940-NEXT: s_add_i32 s1, s33, 0x4040
+; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_addk_i32 s32, 0x4080
+; GFX940-NEXT: s_add_i32 s1, s33, 64
+; GFX940-NEXT: v_writelane_b32 v0, s59, 0
+; GFX940-NEXT: s_mov_b32 s59, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v0, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1
+; GFX940-NEXT: s_add_i32 s1, s33, 0x4040
+; GFX940-NEXT: scratch_load_dword v0, off, s1 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_addk_i32 s32, 0xbf80
+; GFX940-NEXT: s_mov_b32 s33, s0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ call void asm sideeffect "; use $0", "{s59}"(ptr addrspace(5) %alloca0)
+ ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset() #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v1
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v2, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v1
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v2, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT: scratch_store_b32 off, v2, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT: v_writelane_b32 v2, s59, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v1
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s59, v0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT: scratch_load_b32 v2, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT: v_writelane_b32 v2, s59, 0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v1
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX12-NEXT: v_readfirstlane_b32 s59, v0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_readlane_b32 s59, v2, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ec, v0
+; GFX8-NEXT: v_writelane_b32 v2, s59, 0
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v1
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use alloca0 v1
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v2, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT: v_add_u32_e32 v0, 0x3ec, v0
+; GFX900-NEXT: v_writelane_b32 v2, s59, 0
+; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
+; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use alloca0 v1
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v2, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT: scratch_store_dword off, v2, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_add_i32 s0, s32, 0x4040
+; GFX940-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NEXT: v_add_u32_e32 v0, 0x3ec, v0
+; GFX940-NEXT: v_writelane_b32 v2, s59, 0
+; GFX940-NEXT: s_add_i32 s0, s32, 64
+; GFX940-NEXT: v_readfirstlane_b32 s59, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use alloca0 v1
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v2, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT: scratch_load_dword v2, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca [4096 x i32], align 4, addrspace(5)
+ %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 251
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
+ ret void
+}
+
+define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset(i32 inreg %soffset) #0 {
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32
+; GFX10_1-NEXT: s_lshl_b32 s4, s6, 2
+; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: v_add_nc_u32_e32 v3, 0x4040, v3
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v0
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, s4, v3
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v2, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v3, 5, s32
+; GFX10_3-NEXT: s_lshl_b32 s4, s6, 2
+; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: v_add_nc_u32_e32 v3, 0x4040, v3
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v0
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, s4, v3
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v2, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s1, -1
+; GFX11-NEXT: s_add_i32 s2, s32, 0x8040
+; GFX11-NEXT: scratch_store_b32 off, v2, s2 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-NEXT: s_add_i32 s1, s32, 64
+; GFX11-NEXT: v_writelane_b32 v2, s59, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4040
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_add_nc_u32_e64 v1, s0, s1
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s59, v1
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v2, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT: scratch_load_b32 v2, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s1, -1
+; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_lshl_b32 s0, s0, 2
+; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000
+; GFX12-NEXT: v_writelane_b32 v2, s59, 0
+; GFX12-NEXT: v_add_nc_u32_e64 v1, s0, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v0
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s59, v1
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_readlane_b32 s59, v2, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s7, s32, 0x201000
+; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
+; GFX8-NEXT: s_lshl_b32 s4, s6, 2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_writelane_b32 v2, s59, 0
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v1
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use alloca0 v1
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v2, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s7, s32, 0x201000
+; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s7 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: s_lshl_b32 s4, s6, 2
+; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX900-NEXT: v_writelane_b32 v2, s59, 0
+; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
+; GFX900-NEXT: v_add_u32_e32 v1, 64, v1
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use alloca0 v1
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v2, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1
+; GFX940-NEXT: s_add_i32 s1, s32, 0x8040
+; GFX940-NEXT: scratch_store_dword off, v2, s1 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_lshl_b32 s0, s0, 2
+; GFX940-NEXT: s_add_i32 s1, s32, 0x4040
+; GFX940-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX940-NEXT: v_writelane_b32 v2, s59, 0
+; GFX940-NEXT: s_add_i32 s0, s32, 64
+; GFX940-NEXT: v_readfirstlane_b32 s59, v0
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use alloca0 v1
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_and_b64 s[0:1], 0, exec
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s59, v2, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT: scratch_load_dword v2, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca [4096 x i32], align 4, addrspace(5)
+ %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 %soffset
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+ call void asm sideeffect "; use $0, $1", "{s59},{scc}"(ptr addrspace(5) %alloca1.offset, i32 0)
+ ret void
+}
+
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" "frame-pointer"="all" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
new file mode 100644
index 0000000..9cd92dc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -0,0 +1,2323 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10_1 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10_3 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+ <16 x i32>, <7 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs
+ <16 x i32>, <5 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs
+ <16 x i32>, <6 x i32>, ; vgprs
+ i64 ; vcc
+ }
+
+; %alloca1 should end up materializing with s_mov_b32, but scc is
+; unavailable.
+;
+; This primarily tests gfx7 and gfx8, which do not have a carry-less
+; vector add.
+;
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 {
+; GFX7-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: v_writelane_b32 v23, s30, 0
+; GFX7-NEXT: v_writelane_b32 v23, s31, 1
+; GFX7-NEXT: v_writelane_b32 v23, s33, 2
+; GFX7-NEXT: v_writelane_b32 v23, s34, 3
+; GFX7-NEXT: v_writelane_b32 v23, s35, 4
+; GFX7-NEXT: v_writelane_b32 v23, s36, 5
+; GFX7-NEXT: v_writelane_b32 v23, s37, 6
+; GFX7-NEXT: v_writelane_b32 v23, s38, 7
+; GFX7-NEXT: v_writelane_b32 v23, s39, 8
+; GFX7-NEXT: v_writelane_b32 v23, s40, 9
+; GFX7-NEXT: v_writelane_b32 v23, s41, 10
+; GFX7-NEXT: v_writelane_b32 v23, s42, 11
+; GFX7-NEXT: v_writelane_b32 v23, s43, 12
+; GFX7-NEXT: v_writelane_b32 v23, s44, 13
+; GFX7-NEXT: v_writelane_b32 v23, s45, 14
+; GFX7-NEXT: v_writelane_b32 v23, s46, 15
+; GFX7-NEXT: v_writelane_b32 v23, s47, 16
+; GFX7-NEXT: v_writelane_b32 v23, s48, 17
+; GFX7-NEXT: v_writelane_b32 v23, s49, 18
+; GFX7-NEXT: v_writelane_b32 v23, s50, 19
+; GFX7-NEXT: v_writelane_b32 v23, s51, 20
+; GFX7-NEXT: v_writelane_b32 v23, s52, 21
+; GFX7-NEXT: v_writelane_b32 v23, s53, 22
+; GFX7-NEXT: v_writelane_b32 v23, s54, 23
+; GFX7-NEXT: v_writelane_b32 v23, s55, 24
+; GFX7-NEXT: v_writelane_b32 v23, s56, 25
+; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT: v_writelane_b32 v23, s57, 26
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
+; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX7-NEXT: v_writelane_b32 v23, s58, 27
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; use alloca0 v0
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x4040
+; GFX7-NEXT: v_mad_u32_u24 v0, v0, 64, s32
+; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT: v_writelane_b32 v23, s59, 28
+; GFX7-NEXT: v_readfirstlane_b32 s59, v0
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: v_readlane_b32 s59, v23, 28
+; GFX7-NEXT: v_readlane_b32 s58, v23, 27
+; GFX7-NEXT: v_readlane_b32 s57, v23, 26
+; GFX7-NEXT: v_readlane_b32 s56, v23, 25
+; GFX7-NEXT: v_readlane_b32 s55, v23, 24
+; GFX7-NEXT: v_readlane_b32 s54, v23, 23
+; GFX7-NEXT: v_readlane_b32 s53, v23, 22
+; GFX7-NEXT: v_readlane_b32 s52, v23, 21
+; GFX7-NEXT: v_readlane_b32 s51, v23, 20
+; GFX7-NEXT: v_readlane_b32 s50, v23, 19
+; GFX7-NEXT: v_readlane_b32 s49, v23, 18
+; GFX7-NEXT: v_readlane_b32 s48, v23, 17
+; GFX7-NEXT: v_readlane_b32 s47, v23, 16
+; GFX7-NEXT: v_readlane_b32 s46, v23, 15
+; GFX7-NEXT: v_readlane_b32 s45, v23, 14
+; GFX7-NEXT: v_readlane_b32 s44, v23, 13
+; GFX7-NEXT: v_readlane_b32 s43, v23, 12
+; GFX7-NEXT: v_readlane_b32 s42, v23, 11
+; GFX7-NEXT: v_readlane_b32 s41, v23, 10
+; GFX7-NEXT: v_readlane_b32 s40, v23, 9
+; GFX7-NEXT: v_readlane_b32 s39, v23, 8
+; GFX7-NEXT: v_readlane_b32 s38, v23, 7
+; GFX7-NEXT: v_readlane_b32 s37, v23, 6
+; GFX7-NEXT: v_readlane_b32 s36, v23, 5
+; GFX7-NEXT: v_readlane_b32 s35, v23, 4
+; GFX7-NEXT: v_readlane_b32 s34, v23, 3
+; GFX7-NEXT: v_readlane_b32 s33, v23, 2
+; GFX7-NEXT: v_readlane_b32 s31, v23, 1
+; GFX7-NEXT: v_readlane_b32 s30, v23, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v23, s30, 0
+; GFX8-NEXT: v_writelane_b32 v23, s31, 1
+; GFX8-NEXT: v_writelane_b32 v23, s33, 2
+; GFX8-NEXT: v_writelane_b32 v23, s34, 3
+; GFX8-NEXT: v_writelane_b32 v23, s35, 4
+; GFX8-NEXT: v_writelane_b32 v23, s36, 5
+; GFX8-NEXT: v_writelane_b32 v23, s37, 6
+; GFX8-NEXT: v_writelane_b32 v23, s38, 7
+; GFX8-NEXT: v_writelane_b32 v23, s39, 8
+; GFX8-NEXT: v_writelane_b32 v23, s40, 9
+; GFX8-NEXT: v_writelane_b32 v23, s41, 10
+; GFX8-NEXT: v_writelane_b32 v23, s42, 11
+; GFX8-NEXT: v_writelane_b32 v23, s43, 12
+; GFX8-NEXT: v_writelane_b32 v23, s44, 13
+; GFX8-NEXT: v_writelane_b32 v23, s45, 14
+; GFX8-NEXT: v_writelane_b32 v23, s46, 15
+; GFX8-NEXT: v_writelane_b32 v23, s47, 16
+; GFX8-NEXT: v_writelane_b32 v23, s48, 17
+; GFX8-NEXT: v_writelane_b32 v23, s49, 18
+; GFX8-NEXT: v_writelane_b32 v23, s50, 19
+; GFX8-NEXT: v_writelane_b32 v23, s51, 20
+; GFX8-NEXT: v_writelane_b32 v23, s52, 21
+; GFX8-NEXT: v_writelane_b32 v23, s53, 22
+; GFX8-NEXT: v_writelane_b32 v23, s54, 23
+; GFX8-NEXT: v_writelane_b32 v23, s55, 24
+; GFX8-NEXT: v_writelane_b32 v23, s56, 25
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: v_writelane_b32 v23, s57, 26
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: v_writelane_b32 v23, s58, 27
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use alloca0 v0
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x4040
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, 64, s32
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: v_writelane_b32 v23, s59, 28
+; GFX8-NEXT: v_readfirstlane_b32 s59, v0
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v23, 28
+; GFX8-NEXT: v_readlane_b32 s58, v23, 27
+; GFX8-NEXT: v_readlane_b32 s57, v23, 26
+; GFX8-NEXT: v_readlane_b32 s56, v23, 25
+; GFX8-NEXT: v_readlane_b32 s55, v23, 24
+; GFX8-NEXT: v_readlane_b32 s54, v23, 23
+; GFX8-NEXT: v_readlane_b32 s53, v23, 22
+; GFX8-NEXT: v_readlane_b32 s52, v23, 21
+; GFX8-NEXT: v_readlane_b32 s51, v23, 20
+; GFX8-NEXT: v_readlane_b32 s50, v23, 19
+; GFX8-NEXT: v_readlane_b32 s49, v23, 18
+; GFX8-NEXT: v_readlane_b32 s48, v23, 17
+; GFX8-NEXT: v_readlane_b32 s47, v23, 16
+; GFX8-NEXT: v_readlane_b32 s46, v23, 15
+; GFX8-NEXT: v_readlane_b32 s45, v23, 14
+; GFX8-NEXT: v_readlane_b32 s44, v23, 13
+; GFX8-NEXT: v_readlane_b32 s43, v23, 12
+; GFX8-NEXT: v_readlane_b32 s42, v23, 11
+; GFX8-NEXT: v_readlane_b32 s41, v23, 10
+; GFX8-NEXT: v_readlane_b32 s40, v23, 9
+; GFX8-NEXT: v_readlane_b32 s39, v23, 8
+; GFX8-NEXT: v_readlane_b32 s38, v23, 7
+; GFX8-NEXT: v_readlane_b32 s37, v23, 6
+; GFX8-NEXT: v_readlane_b32 s36, v23, 5
+; GFX8-NEXT: v_readlane_b32 s35, v23, 4
+; GFX8-NEXT: v_readlane_b32 s34, v23, 3
+; GFX8-NEXT: v_readlane_b32 s33, v23, 2
+; GFX8-NEXT: v_readlane_b32 s31, v23, 1
+; GFX8-NEXT: v_readlane_b32 s30, v23, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v23, s30, 0
+; GFX900-NEXT: v_writelane_b32 v23, s31, 1
+; GFX900-NEXT: v_writelane_b32 v23, s33, 2
+; GFX900-NEXT: v_writelane_b32 v23, s34, 3
+; GFX900-NEXT: v_writelane_b32 v23, s35, 4
+; GFX900-NEXT: v_writelane_b32 v23, s36, 5
+; GFX900-NEXT: v_writelane_b32 v23, s37, 6
+; GFX900-NEXT: v_writelane_b32 v23, s38, 7
+; GFX900-NEXT: v_writelane_b32 v23, s39, 8
+; GFX900-NEXT: v_writelane_b32 v23, s40, 9
+; GFX900-NEXT: v_writelane_b32 v23, s41, 10
+; GFX900-NEXT: v_writelane_b32 v23, s42, 11
+; GFX900-NEXT: v_writelane_b32 v23, s43, 12
+; GFX900-NEXT: v_writelane_b32 v23, s44, 13
+; GFX900-NEXT: v_writelane_b32 v23, s45, 14
+; GFX900-NEXT: v_writelane_b32 v23, s46, 15
+; GFX900-NEXT: v_writelane_b32 v23, s47, 16
+; GFX900-NEXT: v_writelane_b32 v23, s48, 17
+; GFX900-NEXT: v_writelane_b32 v23, s49, 18
+; GFX900-NEXT: v_writelane_b32 v23, s50, 19
+; GFX900-NEXT: v_writelane_b32 v23, s51, 20
+; GFX900-NEXT: v_writelane_b32 v23, s52, 21
+; GFX900-NEXT: v_writelane_b32 v23, s53, 22
+; GFX900-NEXT: v_writelane_b32 v23, s54, 23
+; GFX900-NEXT: v_writelane_b32 v23, s55, 24
+; GFX900-NEXT: v_writelane_b32 v23, s56, 25
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_writelane_b32 v23, s57, 26
+; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: v_writelane_b32 v23, s58, 27
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use alloca0 v0
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT: v_writelane_b32 v23, s59, 28
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v23, 28
+; GFX900-NEXT: v_readlane_b32 s58, v23, 27
+; GFX900-NEXT: v_readlane_b32 s57, v23, 26
+; GFX900-NEXT: v_readlane_b32 s56, v23, 25
+; GFX900-NEXT: v_readlane_b32 s55, v23, 24
+; GFX900-NEXT: v_readlane_b32 s54, v23, 23
+; GFX900-NEXT: v_readlane_b32 s53, v23, 22
+; GFX900-NEXT: v_readlane_b32 s52, v23, 21
+; GFX900-NEXT: v_readlane_b32 s51, v23, 20
+; GFX900-NEXT: v_readlane_b32 s50, v23, 19
+; GFX900-NEXT: v_readlane_b32 s49, v23, 18
+; GFX900-NEXT: v_readlane_b32 s48, v23, 17
+; GFX900-NEXT: v_readlane_b32 s47, v23, 16
+; GFX900-NEXT: v_readlane_b32 s46, v23, 15
+; GFX900-NEXT: v_readlane_b32 s45, v23, 14
+; GFX900-NEXT: v_readlane_b32 s44, v23, 13
+; GFX900-NEXT: v_readlane_b32 s43, v23, 12
+; GFX900-NEXT: v_readlane_b32 s42, v23, 11
+; GFX900-NEXT: v_readlane_b32 s41, v23, 10
+; GFX900-NEXT: v_readlane_b32 s40, v23, 9
+; GFX900-NEXT: v_readlane_b32 s39, v23, 8
+; GFX900-NEXT: v_readlane_b32 s38, v23, 7
+; GFX900-NEXT: v_readlane_b32 s37, v23, 6
+; GFX900-NEXT: v_readlane_b32 s36, v23, 5
+; GFX900-NEXT: v_readlane_b32 s35, v23, 4
+; GFX900-NEXT: v_readlane_b32 s34, v23, 3
+; GFX900-NEXT: v_readlane_b32 s33, v23, 2
+; GFX900-NEXT: v_readlane_b32 s31, v23, 1
+; GFX900-NEXT: v_readlane_b32 s30, v23, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x101100
+; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT: scratch_store_dword off, v23, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: v_writelane_b32 v23, s30, 0
+; GFX940-NEXT: v_writelane_b32 v23, s31, 1
+; GFX940-NEXT: v_writelane_b32 v23, s33, 2
+; GFX940-NEXT: v_writelane_b32 v23, s34, 3
+; GFX940-NEXT: v_writelane_b32 v23, s35, 4
+; GFX940-NEXT: v_writelane_b32 v23, s36, 5
+; GFX940-NEXT: v_writelane_b32 v23, s37, 6
+; GFX940-NEXT: v_writelane_b32 v23, s38, 7
+; GFX940-NEXT: v_writelane_b32 v23, s39, 8
+; GFX940-NEXT: v_writelane_b32 v23, s40, 9
+; GFX940-NEXT: v_writelane_b32 v23, s41, 10
+; GFX940-NEXT: v_writelane_b32 v23, s42, 11
+; GFX940-NEXT: v_writelane_b32 v23, s43, 12
+; GFX940-NEXT: v_writelane_b32 v23, s44, 13
+; GFX940-NEXT: v_writelane_b32 v23, s45, 14
+; GFX940-NEXT: v_writelane_b32 v23, s46, 15
+; GFX940-NEXT: v_writelane_b32 v23, s47, 16
+; GFX940-NEXT: v_writelane_b32 v23, s48, 17
+; GFX940-NEXT: v_writelane_b32 v23, s49, 18
+; GFX940-NEXT: v_writelane_b32 v23, s50, 19
+; GFX940-NEXT: v_writelane_b32 v23, s51, 20
+; GFX940-NEXT: v_writelane_b32 v23, s52, 21
+; GFX940-NEXT: v_writelane_b32 v23, s53, 22
+; GFX940-NEXT: v_writelane_b32 v23, s54, 23
+; GFX940-NEXT: v_writelane_b32 v23, s55, 24
+; GFX940-NEXT: v_writelane_b32 v23, s56, 25
+; GFX940-NEXT: v_writelane_b32 v23, s57, 26
+; GFX940-NEXT: v_writelane_b32 v23, s58, 27
+; GFX940-NEXT: v_writelane_b32 v23, s59, 28
+; GFX940-NEXT: v_writelane_b32 v23, s60, 29
+; GFX940-NEXT: s_add_i32 s0, s32, 64
+; GFX940-NEXT: v_writelane_b32 v23, s61, 30
+; GFX940-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NEXT: s_and_b64 s[60:61], 0, exec
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use alloca0 v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_addc_u32 s60, s32, 0x4040
+; GFX940-NEXT: s_bitcmp1_b32 s60, 0
+; GFX940-NEXT: s_bitset0_b32 s60, 0
+; GFX940-NEXT: s_mov_b32 s59, s60
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s61, v23, 30
+; GFX940-NEXT: v_readlane_b32 s60, v23, 29
+; GFX940-NEXT: v_readlane_b32 s59, v23, 28
+; GFX940-NEXT: v_readlane_b32 s58, v23, 27
+; GFX940-NEXT: v_readlane_b32 s57, v23, 26
+; GFX940-NEXT: v_readlane_b32 s56, v23, 25
+; GFX940-NEXT: v_readlane_b32 s55, v23, 24
+; GFX940-NEXT: v_readlane_b32 s54, v23, 23
+; GFX940-NEXT: v_readlane_b32 s53, v23, 22
+; GFX940-NEXT: v_readlane_b32 s52, v23, 21
+; GFX940-NEXT: v_readlane_b32 s51, v23, 20
+; GFX940-NEXT: v_readlane_b32 s50, v23, 19
+; GFX940-NEXT: v_readlane_b32 s49, v23, 18
+; GFX940-NEXT: v_readlane_b32 s48, v23, 17
+; GFX940-NEXT: v_readlane_b32 s47, v23, 16
+; GFX940-NEXT: v_readlane_b32 s46, v23, 15
+; GFX940-NEXT: v_readlane_b32 s45, v23, 14
+; GFX940-NEXT: v_readlane_b32 s44, v23, 13
+; GFX940-NEXT: v_readlane_b32 s43, v23, 12
+; GFX940-NEXT: v_readlane_b32 s42, v23, 11
+; GFX940-NEXT: v_readlane_b32 s41, v23, 10
+; GFX940-NEXT: v_readlane_b32 s40, v23, 9
+; GFX940-NEXT: v_readlane_b32 s39, v23, 8
+; GFX940-NEXT: v_readlane_b32 s38, v23, 7
+; GFX940-NEXT: v_readlane_b32 s37, v23, 6
+; GFX940-NEXT: v_readlane_b32 s36, v23, 5
+; GFX940-NEXT: v_readlane_b32 s35, v23, 4
+; GFX940-NEXT: v_readlane_b32 s34, v23, 3
+; GFX940-NEXT: v_readlane_b32 s33, v23, 2
+; GFX940-NEXT: v_readlane_b32 s31, v23, 1
+; GFX940-NEXT: v_readlane_b32 s30, v23, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4044
+; GFX940-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v0
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2
+; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3
+; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4
+; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5
+; GFX10_1-NEXT: v_writelane_b32 v23, s37, 6
+; GFX10_1-NEXT: v_writelane_b32 v23, s38, 7
+; GFX10_1-NEXT: v_writelane_b32 v23, s39, 8
+; GFX10_1-NEXT: v_writelane_b32 v23, s40, 9
+; GFX10_1-NEXT: v_writelane_b32 v23, s41, 10
+; GFX10_1-NEXT: v_writelane_b32 v23, s42, 11
+; GFX10_1-NEXT: v_writelane_b32 v23, s43, 12
+; GFX10_1-NEXT: v_writelane_b32 v23, s44, 13
+; GFX10_1-NEXT: v_writelane_b32 v23, s45, 14
+; GFX10_1-NEXT: v_writelane_b32 v23, s46, 15
+; GFX10_1-NEXT: v_writelane_b32 v23, s47, 16
+; GFX10_1-NEXT: v_writelane_b32 v23, s48, 17
+; GFX10_1-NEXT: v_writelane_b32 v23, s49, 18
+; GFX10_1-NEXT: v_writelane_b32 v23, s50, 19
+; GFX10_1-NEXT: v_writelane_b32 v23, s51, 20
+; GFX10_1-NEXT: v_writelane_b32 v23, s52, 21
+; GFX10_1-NEXT: v_writelane_b32 v23, s53, 22
+; GFX10_1-NEXT: v_writelane_b32 v23, s54, 23
+; GFX10_1-NEXT: v_writelane_b32 v23, s55, 24
+; GFX10_1-NEXT: v_writelane_b32 v23, s56, 25
+; GFX10_1-NEXT: v_writelane_b32 v23, s57, 26
+; GFX10_1-NEXT: v_writelane_b32 v23, s58, 27
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v24, 5, s32
+; GFX10_1-NEXT: v_writelane_b32 v23, s59, 28
+; GFX10_1-NEXT: v_add_nc_u32_e32 v24, 0x4040, v24
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v24
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v23, 28
+; GFX10_1-NEXT: v_readlane_b32 s58, v23, 27
+; GFX10_1-NEXT: v_readlane_b32 s57, v23, 26
+; GFX10_1-NEXT: v_readlane_b32 s56, v23, 25
+; GFX10_1-NEXT: v_readlane_b32 s55, v23, 24
+; GFX10_1-NEXT: v_readlane_b32 s54, v23, 23
+; GFX10_1-NEXT: v_readlane_b32 s53, v23, 22
+; GFX10_1-NEXT: v_readlane_b32 s52, v23, 21
+; GFX10_1-NEXT: v_readlane_b32 s51, v23, 20
+; GFX10_1-NEXT: v_readlane_b32 s50, v23, 19
+; GFX10_1-NEXT: v_readlane_b32 s49, v23, 18
+; GFX10_1-NEXT: v_readlane_b32 s48, v23, 17
+; GFX10_1-NEXT: v_readlane_b32 s47, v23, 16
+; GFX10_1-NEXT: v_readlane_b32 s46, v23, 15
+; GFX10_1-NEXT: v_readlane_b32 s45, v23, 14
+; GFX10_1-NEXT: v_readlane_b32 s44, v23, 13
+; GFX10_1-NEXT: v_readlane_b32 s43, v23, 12
+; GFX10_1-NEXT: v_readlane_b32 s42, v23, 11
+; GFX10_1-NEXT: v_readlane_b32 s41, v23, 10
+; GFX10_1-NEXT: v_readlane_b32 s40, v23, 9
+; GFX10_1-NEXT: v_readlane_b32 s39, v23, 8
+; GFX10_1-NEXT: v_readlane_b32 s38, v23, 7
+; GFX10_1-NEXT: v_readlane_b32 s37, v23, 6
+; GFX10_1-NEXT: v_readlane_b32 s36, v23, 5
+; GFX10_1-NEXT: v_readlane_b32 s35, v23, 4
+; GFX10_1-NEXT: v_readlane_b32 s34, v23, 3
+; GFX10_1-NEXT: v_readlane_b32 s33, v23, 2
+; GFX10_1-NEXT: v_readlane_b32 s31, v23, 1
+; GFX10_1-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_1-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v0
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2
+; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3
+; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4
+; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5
+; GFX10_3-NEXT: v_writelane_b32 v23, s37, 6
+; GFX10_3-NEXT: v_writelane_b32 v23, s38, 7
+; GFX10_3-NEXT: v_writelane_b32 v23, s39, 8
+; GFX10_3-NEXT: v_writelane_b32 v23, s40, 9
+; GFX10_3-NEXT: v_writelane_b32 v23, s41, 10
+; GFX10_3-NEXT: v_writelane_b32 v23, s42, 11
+; GFX10_3-NEXT: v_writelane_b32 v23, s43, 12
+; GFX10_3-NEXT: v_writelane_b32 v23, s44, 13
+; GFX10_3-NEXT: v_writelane_b32 v23, s45, 14
+; GFX10_3-NEXT: v_writelane_b32 v23, s46, 15
+; GFX10_3-NEXT: v_writelane_b32 v23, s47, 16
+; GFX10_3-NEXT: v_writelane_b32 v23, s48, 17
+; GFX10_3-NEXT: v_writelane_b32 v23, s49, 18
+; GFX10_3-NEXT: v_writelane_b32 v23, s50, 19
+; GFX10_3-NEXT: v_writelane_b32 v23, s51, 20
+; GFX10_3-NEXT: v_writelane_b32 v23, s52, 21
+; GFX10_3-NEXT: v_writelane_b32 v23, s53, 22
+; GFX10_3-NEXT: v_writelane_b32 v23, s54, 23
+; GFX10_3-NEXT: v_writelane_b32 v23, s55, 24
+; GFX10_3-NEXT: v_writelane_b32 v23, s56, 25
+; GFX10_3-NEXT: v_writelane_b32 v23, s57, 26
+; GFX10_3-NEXT: v_writelane_b32 v23, s58, 27
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v24, 5, s32
+; GFX10_3-NEXT: v_writelane_b32 v23, s59, 28
+; GFX10_3-NEXT: v_add_nc_u32_e32 v24, 0x4040, v24
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v24
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v23, 28
+; GFX10_3-NEXT: v_readlane_b32 s58, v23, 27
+; GFX10_3-NEXT: v_readlane_b32 s57, v23, 26
+; GFX10_3-NEXT: v_readlane_b32 s56, v23, 25
+; GFX10_3-NEXT: v_readlane_b32 s55, v23, 24
+; GFX10_3-NEXT: v_readlane_b32 s54, v23, 23
+; GFX10_3-NEXT: v_readlane_b32 s53, v23, 22
+; GFX10_3-NEXT: v_readlane_b32 s52, v23, 21
+; GFX10_3-NEXT: v_readlane_b32 s51, v23, 20
+; GFX10_3-NEXT: v_readlane_b32 s50, v23, 19
+; GFX10_3-NEXT: v_readlane_b32 s49, v23, 18
+; GFX10_3-NEXT: v_readlane_b32 s48, v23, 17
+; GFX10_3-NEXT: v_readlane_b32 s47, v23, 16
+; GFX10_3-NEXT: v_readlane_b32 s46, v23, 15
+; GFX10_3-NEXT: v_readlane_b32 s45, v23, 14
+; GFX10_3-NEXT: v_readlane_b32 s44, v23, 13
+; GFX10_3-NEXT: v_readlane_b32 s43, v23, 12
+; GFX10_3-NEXT: v_readlane_b32 s42, v23, 11
+; GFX10_3-NEXT: v_readlane_b32 s41, v23, 10
+; GFX10_3-NEXT: v_readlane_b32 s40, v23, 9
+; GFX10_3-NEXT: v_readlane_b32 s39, v23, 8
+; GFX10_3-NEXT: v_readlane_b32 s38, v23, 7
+; GFX10_3-NEXT: v_readlane_b32 s37, v23, 6
+; GFX10_3-NEXT: v_readlane_b32 s36, v23, 5
+; GFX10_3-NEXT: v_readlane_b32 s35, v23, 4
+; GFX10_3-NEXT: v_readlane_b32 s34, v23, 3
+; GFX10_3-NEXT: v_readlane_b32 s33, v23, 2
+; GFX10_3-NEXT: v_readlane_b32 s31, v23, 1
+; GFX10_3-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880
+; GFX10_3-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v23, s30, 0
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: v_writelane_b32 v23, s31, 1
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v0
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_writelane_b32 v23, s33, 2
+; GFX11-NEXT: v_writelane_b32 v23, s34, 3
+; GFX11-NEXT: v_writelane_b32 v23, s35, 4
+; GFX11-NEXT: v_writelane_b32 v23, s36, 5
+; GFX11-NEXT: v_writelane_b32 v23, s37, 6
+; GFX11-NEXT: v_writelane_b32 v23, s38, 7
+; GFX11-NEXT: v_writelane_b32 v23, s39, 8
+; GFX11-NEXT: v_writelane_b32 v23, s40, 9
+; GFX11-NEXT: v_writelane_b32 v23, s41, 10
+; GFX11-NEXT: v_writelane_b32 v23, s42, 11
+; GFX11-NEXT: v_writelane_b32 v23, s43, 12
+; GFX11-NEXT: v_writelane_b32 v23, s44, 13
+; GFX11-NEXT: v_writelane_b32 v23, s45, 14
+; GFX11-NEXT: v_writelane_b32 v23, s46, 15
+; GFX11-NEXT: v_writelane_b32 v23, s47, 16
+; GFX11-NEXT: v_writelane_b32 v23, s48, 17
+; GFX11-NEXT: v_writelane_b32 v23, s49, 18
+; GFX11-NEXT: v_writelane_b32 v23, s50, 19
+; GFX11-NEXT: v_writelane_b32 v23, s51, 20
+; GFX11-NEXT: v_writelane_b32 v23, s52, 21
+; GFX11-NEXT: v_writelane_b32 v23, s53, 22
+; GFX11-NEXT: v_writelane_b32 v23, s54, 23
+; GFX11-NEXT: v_writelane_b32 v23, s55, 24
+; GFX11-NEXT: v_writelane_b32 v23, s56, 25
+; GFX11-NEXT: v_writelane_b32 v23, s57, 26
+; GFX11-NEXT: v_writelane_b32 v23, s58, 27
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_addc_u32 s32, s32, 0x4040
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_bitcmp1_b32 s32, 0
+; GFX11-NEXT: v_writelane_b32 v23, s59, 28
+; GFX11-NEXT: s_bitset0_b32 s32, 0
+; GFX11-NEXT: s_mov_b32 s59, s32
+; GFX11-NEXT: s_addc_u32 s32, s32, 0xffffbfc0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_bitcmp1_b32 s32, 0
+; GFX11-NEXT: s_bitset0_b32 s32, 0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v23, 28
+; GFX11-NEXT: v_readlane_b32 s58, v23, 27
+; GFX11-NEXT: v_readlane_b32 s57, v23, 26
+; GFX11-NEXT: v_readlane_b32 s56, v23, 25
+; GFX11-NEXT: v_readlane_b32 s55, v23, 24
+; GFX11-NEXT: v_readlane_b32 s54, v23, 23
+; GFX11-NEXT: v_readlane_b32 s53, v23, 22
+; GFX11-NEXT: v_readlane_b32 s52, v23, 21
+; GFX11-NEXT: v_readlane_b32 s51, v23, 20
+; GFX11-NEXT: v_readlane_b32 s50, v23, 19
+; GFX11-NEXT: v_readlane_b32 s49, v23, 18
+; GFX11-NEXT: v_readlane_b32 s48, v23, 17
+; GFX11-NEXT: v_readlane_b32 s47, v23, 16
+; GFX11-NEXT: v_readlane_b32 s46, v23, 15
+; GFX11-NEXT: v_readlane_b32 s45, v23, 14
+; GFX11-NEXT: v_readlane_b32 s44, v23, 13
+; GFX11-NEXT: v_readlane_b32 s43, v23, 12
+; GFX11-NEXT: v_readlane_b32 s42, v23, 11
+; GFX11-NEXT: v_readlane_b32 s41, v23, 10
+; GFX11-NEXT: v_readlane_b32 s40, v23, 9
+; GFX11-NEXT: v_readlane_b32 s39, v23, 8
+; GFX11-NEXT: v_readlane_b32 s38, v23, 7
+; GFX11-NEXT: v_readlane_b32 s37, v23, 6
+; GFX11-NEXT: v_readlane_b32 s36, v23, 5
+; GFX11-NEXT: v_readlane_b32 s35, v23, 4
+; GFX11-NEXT: v_readlane_b32 s34, v23, 3
+; GFX11-NEXT: v_readlane_b32 s33, v23, 2
+; GFX11-NEXT: v_readlane_b32 s31, v23, 1
+; GFX11-NEXT: v_readlane_b32 s30, v23, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4044
+; GFX11-NEXT: scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: v_writelane_b32 v23, s30, 0
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v0
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_writelane_b32 v23, s31, 1
+; GFX12-NEXT: v_writelane_b32 v23, s33, 2
+; GFX12-NEXT: v_writelane_b32 v23, s34, 3
+; GFX12-NEXT: v_writelane_b32 v23, s35, 4
+; GFX12-NEXT: v_writelane_b32 v23, s36, 5
+; GFX12-NEXT: v_writelane_b32 v23, s37, 6
+; GFX12-NEXT: v_writelane_b32 v23, s38, 7
+; GFX12-NEXT: v_writelane_b32 v23, s39, 8
+; GFX12-NEXT: v_writelane_b32 v23, s40, 9
+; GFX12-NEXT: v_writelane_b32 v23, s41, 10
+; GFX12-NEXT: v_writelane_b32 v23, s42, 11
+; GFX12-NEXT: v_writelane_b32 v23, s43, 12
+; GFX12-NEXT: v_writelane_b32 v23, s44, 13
+; GFX12-NEXT: v_writelane_b32 v23, s45, 14
+; GFX12-NEXT: v_writelane_b32 v23, s46, 15
+; GFX12-NEXT: v_writelane_b32 v23, s47, 16
+; GFX12-NEXT: v_writelane_b32 v23, s48, 17
+; GFX12-NEXT: v_writelane_b32 v23, s49, 18
+; GFX12-NEXT: v_writelane_b32 v23, s50, 19
+; GFX12-NEXT: v_writelane_b32 v23, s51, 20
+; GFX12-NEXT: v_writelane_b32 v23, s52, 21
+; GFX12-NEXT: v_writelane_b32 v23, s53, 22
+; GFX12-NEXT: v_writelane_b32 v23, s54, 23
+; GFX12-NEXT: v_writelane_b32 v23, s55, 24
+; GFX12-NEXT: v_writelane_b32 v23, s56, 25
+; GFX12-NEXT: v_writelane_b32 v23, s57, 26
+; GFX12-NEXT: v_writelane_b32 v23, s58, 27
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0x4000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_bitcmp1_b32 s32, 0
+; GFX12-NEXT: v_writelane_b32 v23, s59, 28
+; GFX12-NEXT: s_bitset0_b32 s32, 0
+; GFX12-NEXT: s_mov_b32 s59, s32
+; GFX12-NEXT: s_add_co_ci_u32 s32, s32, 0xffffc000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_bitcmp1_b32 s32, 0
+; GFX12-NEXT: s_bitset0_b32 s32, 0
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_readlane_b32 s59, v23, 28
+; GFX12-NEXT: v_readlane_b32 s58, v23, 27
+; GFX12-NEXT: v_readlane_b32 s57, v23, 26
+; GFX12-NEXT: v_readlane_b32 s56, v23, 25
+; GFX12-NEXT: v_readlane_b32 s55, v23, 24
+; GFX12-NEXT: v_readlane_b32 s54, v23, 23
+; GFX12-NEXT: v_readlane_b32 s53, v23, 22
+; GFX12-NEXT: v_readlane_b32 s52, v23, 21
+; GFX12-NEXT: v_readlane_b32 s51, v23, 20
+; GFX12-NEXT: v_readlane_b32 s50, v23, 19
+; GFX12-NEXT: v_readlane_b32 s49, v23, 18
+; GFX12-NEXT: v_readlane_b32 s48, v23, 17
+; GFX12-NEXT: v_readlane_b32 s47, v23, 16
+; GFX12-NEXT: v_readlane_b32 s46, v23, 15
+; GFX12-NEXT: v_readlane_b32 s45, v23, 14
+; GFX12-NEXT: v_readlane_b32 s44, v23, 13
+; GFX12-NEXT: v_readlane_b32 s43, v23, 12
+; GFX12-NEXT: v_readlane_b32 s42, v23, 11
+; GFX12-NEXT: v_readlane_b32 s41, v23, 10
+; GFX12-NEXT: v_readlane_b32 s40, v23, 9
+; GFX12-NEXT: v_readlane_b32 s39, v23, 8
+; GFX12-NEXT: v_readlane_b32 s38, v23, 7
+; GFX12-NEXT: v_readlane_b32 s37, v23, 6
+; GFX12-NEXT: v_readlane_b32 s36, v23, 5
+; GFX12-NEXT: v_readlane_b32 s35, v23, 4
+; GFX12-NEXT: v_readlane_b32 s34, v23, 3
+; GFX12-NEXT: v_readlane_b32 s33, v23, 2
+; GFX12-NEXT: v_readlane_b32 s31, v23, 1
+; GFX12-NEXT: v_readlane_b32 s30, v23, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca i32, align 4, addrspace(5)
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+ ; Force no SGPRs to be available for the carry-out of the vector add.
+ %asm = call %asm.output asm sideeffect
+ "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+ "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"()
+
+ %s0 = extractvalue %asm.output %asm, 0
+ %s1 = extractvalue %asm.output %asm, 1
+ %s2 = extractvalue %asm.output %asm, 2
+ %s3 = extractvalue %asm.output %asm, 3
+ %s4 = extractvalue %asm.output %asm, 4
+ %s5 = extractvalue %asm.output %asm, 5
+
+ %v0 = extractvalue %asm.output %asm, 6
+ %v1 = extractvalue %asm.output %asm, 7
+
+ %vcc = extractvalue %asm.output %asm, 8
+
+ ; scc is unavailable since it is live in
+ call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"(
+ <16 x i32> %s0,
+ <16 x i32> %s1,
+ <16 x i32> %s2,
+ <8 x i32> %s3,
+ <2 x i32> %s4,
+ i32 %s5,
+ <16 x i32> %v0,
+ <7 x i32> %v1,
+ i64 %vcc,
+ ptr addrspace(5) %alloca1,
+ i32 0) ; use of scc
+
+ ret void
+}
+
+; FIXME: This test would have the FI at offset 0, but other objects get
+; assigned there. It instead shows a non-0 offset, but one that is an
+; inline immediate and can fold directly into the address computation.
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset() #1 {
+; GFX7-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: s_add_i32 s6, s32, 0x100400
+; GFX7-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: v_writelane_b32 v21, s30, 0
+; GFX7-NEXT: v_writelane_b32 v21, s31, 1
+; GFX7-NEXT: v_writelane_b32 v21, s33, 2
+; GFX7-NEXT: v_writelane_b32 v21, s34, 3
+; GFX7-NEXT: v_writelane_b32 v21, s35, 4
+; GFX7-NEXT: v_writelane_b32 v21, s36, 5
+; GFX7-NEXT: v_writelane_b32 v21, s37, 6
+; GFX7-NEXT: v_writelane_b32 v21, s38, 7
+; GFX7-NEXT: v_writelane_b32 v21, s39, 8
+; GFX7-NEXT: v_writelane_b32 v21, s40, 9
+; GFX7-NEXT: v_writelane_b32 v21, s41, 10
+; GFX7-NEXT: v_writelane_b32 v21, s42, 11
+; GFX7-NEXT: v_writelane_b32 v21, s43, 12
+; GFX7-NEXT: v_writelane_b32 v21, s44, 13
+; GFX7-NEXT: v_writelane_b32 v21, s45, 14
+; GFX7-NEXT: v_writelane_b32 v21, s46, 15
+; GFX7-NEXT: v_writelane_b32 v21, s47, 16
+; GFX7-NEXT: v_writelane_b32 v21, s48, 17
+; GFX7-NEXT: v_writelane_b32 v21, s49, 18
+; GFX7-NEXT: v_writelane_b32 v21, s50, 19
+; GFX7-NEXT: v_writelane_b32 v21, s51, 20
+; GFX7-NEXT: v_writelane_b32 v21, s52, 21
+; GFX7-NEXT: v_writelane_b32 v21, s53, 22
+; GFX7-NEXT: v_writelane_b32 v21, s54, 23
+; GFX7-NEXT: v_writelane_b32 v21, s55, 24
+; GFX7-NEXT: v_writelane_b32 v21, s56, 25
+; GFX7-NEXT: v_writelane_b32 v21, s57, 26
+; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX7-NEXT: v_mov_b32_e32 v22, 16
+; GFX7-NEXT: v_writelane_b32 v21, s58, 27
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX7-NEXT: v_lshr_b32_e64 v22, s32, 6
+; GFX7-NEXT: v_writelane_b32 v21, s59, 28
+; GFX7-NEXT: v_readfirstlane_b32 s59, v22
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: v_readlane_b32 s59, v21, 28
+; GFX7-NEXT: v_readlane_b32 s58, v21, 27
+; GFX7-NEXT: v_readlane_b32 s57, v21, 26
+; GFX7-NEXT: v_readlane_b32 s56, v21, 25
+; GFX7-NEXT: v_readlane_b32 s55, v21, 24
+; GFX7-NEXT: v_readlane_b32 s54, v21, 23
+; GFX7-NEXT: v_readlane_b32 s53, v21, 22
+; GFX7-NEXT: v_readlane_b32 s52, v21, 21
+; GFX7-NEXT: v_readlane_b32 s51, v21, 20
+; GFX7-NEXT: v_readlane_b32 s50, v21, 19
+; GFX7-NEXT: v_readlane_b32 s49, v21, 18
+; GFX7-NEXT: v_readlane_b32 s48, v21, 17
+; GFX7-NEXT: v_readlane_b32 s47, v21, 16
+; GFX7-NEXT: v_readlane_b32 s46, v21, 15
+; GFX7-NEXT: v_readlane_b32 s45, v21, 14
+; GFX7-NEXT: v_readlane_b32 s44, v21, 13
+; GFX7-NEXT: v_readlane_b32 s43, v21, 12
+; GFX7-NEXT: v_readlane_b32 s42, v21, 11
+; GFX7-NEXT: v_readlane_b32 s41, v21, 10
+; GFX7-NEXT: v_readlane_b32 s40, v21, 9
+; GFX7-NEXT: v_readlane_b32 s39, v21, 8
+; GFX7-NEXT: v_readlane_b32 s38, v21, 7
+; GFX7-NEXT: v_readlane_b32 s37, v21, 6
+; GFX7-NEXT: v_readlane_b32 s36, v21, 5
+; GFX7-NEXT: v_readlane_b32 s35, v21, 4
+; GFX7-NEXT: v_readlane_b32 s34, v21, 3
+; GFX7-NEXT: v_readlane_b32 s33, v21, 2
+; GFX7-NEXT: v_readlane_b32 s31, v21, 1
+; GFX7-NEXT: v_readlane_b32 s30, v21, 0
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: s_add_i32 s6, s32, 0x100400
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x100400
+; GFX8-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v21, s30, 0
+; GFX8-NEXT: v_writelane_b32 v21, s31, 1
+; GFX8-NEXT: v_writelane_b32 v21, s33, 2
+; GFX8-NEXT: v_writelane_b32 v21, s34, 3
+; GFX8-NEXT: v_writelane_b32 v21, s35, 4
+; GFX8-NEXT: v_writelane_b32 v21, s36, 5
+; GFX8-NEXT: v_writelane_b32 v21, s37, 6
+; GFX8-NEXT: v_writelane_b32 v21, s38, 7
+; GFX8-NEXT: v_writelane_b32 v21, s39, 8
+; GFX8-NEXT: v_writelane_b32 v21, s40, 9
+; GFX8-NEXT: v_writelane_b32 v21, s41, 10
+; GFX8-NEXT: v_writelane_b32 v21, s42, 11
+; GFX8-NEXT: v_writelane_b32 v21, s43, 12
+; GFX8-NEXT: v_writelane_b32 v21, s44, 13
+; GFX8-NEXT: v_writelane_b32 v21, s45, 14
+; GFX8-NEXT: v_writelane_b32 v21, s46, 15
+; GFX8-NEXT: v_writelane_b32 v21, s47, 16
+; GFX8-NEXT: v_writelane_b32 v21, s48, 17
+; GFX8-NEXT: v_writelane_b32 v21, s49, 18
+; GFX8-NEXT: v_writelane_b32 v21, s50, 19
+; GFX8-NEXT: v_writelane_b32 v21, s51, 20
+; GFX8-NEXT: v_writelane_b32 v21, s52, 21
+; GFX8-NEXT: v_writelane_b32 v21, s53, 22
+; GFX8-NEXT: v_writelane_b32 v21, s54, 23
+; GFX8-NEXT: v_writelane_b32 v21, s55, 24
+; GFX8-NEXT: v_writelane_b32 v21, s56, 25
+; GFX8-NEXT: v_writelane_b32 v21, s57, 26
+; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX8-NEXT: v_mov_b32_e32 v22, 16
+; GFX8-NEXT: v_writelane_b32 v21, s58, 27
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX8-NEXT: v_lshrrev_b32_e64 v22, 6, s32
+; GFX8-NEXT: v_writelane_b32 v21, s59, 28
+; GFX8-NEXT: v_readfirstlane_b32 s59, v22
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v21, 28
+; GFX8-NEXT: v_readlane_b32 s58, v21, 27
+; GFX8-NEXT: v_readlane_b32 s57, v21, 26
+; GFX8-NEXT: v_readlane_b32 s56, v21, 25
+; GFX8-NEXT: v_readlane_b32 s55, v21, 24
+; GFX8-NEXT: v_readlane_b32 s54, v21, 23
+; GFX8-NEXT: v_readlane_b32 s53, v21, 22
+; GFX8-NEXT: v_readlane_b32 s52, v21, 21
+; GFX8-NEXT: v_readlane_b32 s51, v21, 20
+; GFX8-NEXT: v_readlane_b32 s50, v21, 19
+; GFX8-NEXT: v_readlane_b32 s49, v21, 18
+; GFX8-NEXT: v_readlane_b32 s48, v21, 17
+; GFX8-NEXT: v_readlane_b32 s47, v21, 16
+; GFX8-NEXT: v_readlane_b32 s46, v21, 15
+; GFX8-NEXT: v_readlane_b32 s45, v21, 14
+; GFX8-NEXT: v_readlane_b32 s44, v21, 13
+; GFX8-NEXT: v_readlane_b32 s43, v21, 12
+; GFX8-NEXT: v_readlane_b32 s42, v21, 11
+; GFX8-NEXT: v_readlane_b32 s41, v21, 10
+; GFX8-NEXT: v_readlane_b32 s40, v21, 9
+; GFX8-NEXT: v_readlane_b32 s39, v21, 8
+; GFX8-NEXT: v_readlane_b32 s38, v21, 7
+; GFX8-NEXT: v_readlane_b32 s37, v21, 6
+; GFX8-NEXT: v_readlane_b32 s36, v21, 5
+; GFX8-NEXT: v_readlane_b32 s35, v21, 4
+; GFX8-NEXT: v_readlane_b32 s34, v21, 3
+; GFX8-NEXT: v_readlane_b32 s33, v21, 2
+; GFX8-NEXT: v_readlane_b32 s31, v21, 1
+; GFX8-NEXT: v_readlane_b32 s30, v21, 0
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x100400
+; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x100400
+; GFX900-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v21, s30, 0
+; GFX900-NEXT: v_writelane_b32 v21, s31, 1
+; GFX900-NEXT: v_writelane_b32 v21, s33, 2
+; GFX900-NEXT: v_writelane_b32 v21, s34, 3
+; GFX900-NEXT: v_writelane_b32 v21, s35, 4
+; GFX900-NEXT: v_writelane_b32 v21, s36, 5
+; GFX900-NEXT: v_writelane_b32 v21, s37, 6
+; GFX900-NEXT: v_writelane_b32 v21, s38, 7
+; GFX900-NEXT: v_writelane_b32 v21, s39, 8
+; GFX900-NEXT: v_writelane_b32 v21, s40, 9
+; GFX900-NEXT: v_writelane_b32 v21, s41, 10
+; GFX900-NEXT: v_writelane_b32 v21, s42, 11
+; GFX900-NEXT: v_writelane_b32 v21, s43, 12
+; GFX900-NEXT: v_writelane_b32 v21, s44, 13
+; GFX900-NEXT: v_writelane_b32 v21, s45, 14
+; GFX900-NEXT: v_writelane_b32 v21, s46, 15
+; GFX900-NEXT: v_writelane_b32 v21, s47, 16
+; GFX900-NEXT: v_writelane_b32 v21, s48, 17
+; GFX900-NEXT: v_writelane_b32 v21, s49, 18
+; GFX900-NEXT: v_writelane_b32 v21, s50, 19
+; GFX900-NEXT: v_writelane_b32 v21, s51, 20
+; GFX900-NEXT: v_writelane_b32 v21, s52, 21
+; GFX900-NEXT: v_writelane_b32 v21, s53, 22
+; GFX900-NEXT: v_writelane_b32 v21, s54, 23
+; GFX900-NEXT: v_writelane_b32 v21, s55, 24
+; GFX900-NEXT: v_writelane_b32 v21, s56, 25
+; GFX900-NEXT: v_writelane_b32 v21, s57, 26
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: v_writelane_b32 v21, s58, 27
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_lshrrev_b32_e64 v22, 6, s32
+; GFX900-NEXT: v_add_u32_e32 v22, 16, v22
+; GFX900-NEXT: v_writelane_b32 v21, s59, 28
+; GFX900-NEXT: v_readfirstlane_b32 s59, v22
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v21, 28
+; GFX900-NEXT: v_readlane_b32 s58, v21, 27
+; GFX900-NEXT: v_readlane_b32 s57, v21, 26
+; GFX900-NEXT: v_readlane_b32 s56, v21, 25
+; GFX900-NEXT: v_readlane_b32 s55, v21, 24
+; GFX900-NEXT: v_readlane_b32 s54, v21, 23
+; GFX900-NEXT: v_readlane_b32 s53, v21, 22
+; GFX900-NEXT: v_readlane_b32 s52, v21, 21
+; GFX900-NEXT: v_readlane_b32 s51, v21, 20
+; GFX900-NEXT: v_readlane_b32 s50, v21, 19
+; GFX900-NEXT: v_readlane_b32 s49, v21, 18
+; GFX900-NEXT: v_readlane_b32 s48, v21, 17
+; GFX900-NEXT: v_readlane_b32 s47, v21, 16
+; GFX900-NEXT: v_readlane_b32 s46, v21, 15
+; GFX900-NEXT: v_readlane_b32 s45, v21, 14
+; GFX900-NEXT: v_readlane_b32 s44, v21, 13
+; GFX900-NEXT: v_readlane_b32 s43, v21, 12
+; GFX900-NEXT: v_readlane_b32 s42, v21, 11
+; GFX900-NEXT: v_readlane_b32 s41, v21, 10
+; GFX900-NEXT: v_readlane_b32 s40, v21, 9
+; GFX900-NEXT: v_readlane_b32 s39, v21, 8
+; GFX900-NEXT: v_readlane_b32 s38, v21, 7
+; GFX900-NEXT: v_readlane_b32 s37, v21, 6
+; GFX900-NEXT: v_readlane_b32 s36, v21, 5
+; GFX900-NEXT: v_readlane_b32 s35, v21, 4
+; GFX900-NEXT: v_readlane_b32 s34, v21, 3
+; GFX900-NEXT: v_readlane_b32 s33, v21, 2
+; GFX900-NEXT: v_readlane_b32 s31, v21, 1
+; GFX900-NEXT: v_readlane_b32 s30, v21, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x100400
+; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4010
+; GFX940-NEXT: scratch_store_dword off, v21, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: v_writelane_b32 v21, s30, 0
+; GFX940-NEXT: v_writelane_b32 v21, s31, 1
+; GFX940-NEXT: v_writelane_b32 v21, s33, 2
+; GFX940-NEXT: v_writelane_b32 v21, s34, 3
+; GFX940-NEXT: v_writelane_b32 v21, s35, 4
+; GFX940-NEXT: v_writelane_b32 v21, s36, 5
+; GFX940-NEXT: v_writelane_b32 v21, s37, 6
+; GFX940-NEXT: v_writelane_b32 v21, s38, 7
+; GFX940-NEXT: v_writelane_b32 v21, s39, 8
+; GFX940-NEXT: v_writelane_b32 v21, s40, 9
+; GFX940-NEXT: v_writelane_b32 v21, s41, 10
+; GFX940-NEXT: v_writelane_b32 v21, s42, 11
+; GFX940-NEXT: v_writelane_b32 v21, s43, 12
+; GFX940-NEXT: v_writelane_b32 v21, s44, 13
+; GFX940-NEXT: v_writelane_b32 v21, s45, 14
+; GFX940-NEXT: v_writelane_b32 v21, s46, 15
+; GFX940-NEXT: v_writelane_b32 v21, s47, 16
+; GFX940-NEXT: v_writelane_b32 v21, s48, 17
+; GFX940-NEXT: v_writelane_b32 v21, s49, 18
+; GFX940-NEXT: v_writelane_b32 v21, s50, 19
+; GFX940-NEXT: v_writelane_b32 v21, s51, 20
+; GFX940-NEXT: v_writelane_b32 v21, s52, 21
+; GFX940-NEXT: v_writelane_b32 v21, s53, 22
+; GFX940-NEXT: v_writelane_b32 v21, s54, 23
+; GFX940-NEXT: v_writelane_b32 v21, s55, 24
+; GFX940-NEXT: v_writelane_b32 v21, s56, 25
+; GFX940-NEXT: v_writelane_b32 v21, s57, 26
+; GFX940-NEXT: v_writelane_b32 v21, s58, 27
+; GFX940-NEXT: v_writelane_b32 v21, s59, 28
+; GFX940-NEXT: v_writelane_b32 v21, s60, 29
+; GFX940-NEXT: v_writelane_b32 v21, s61, 30
+; GFX940-NEXT: s_and_b64 s[60:61], 0, exec
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_addc_u32 s60, s32, 16
+; GFX940-NEXT: s_bitcmp1_b32 s60, 0
+; GFX940-NEXT: s_bitset0_b32 s60, 0
+; GFX940-NEXT: s_mov_b32 s59, s60
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s61, v21, 30
+; GFX940-NEXT: v_readlane_b32 s60, v21, 29
+; GFX940-NEXT: v_readlane_b32 s59, v21, 28
+; GFX940-NEXT: v_readlane_b32 s58, v21, 27
+; GFX940-NEXT: v_readlane_b32 s57, v21, 26
+; GFX940-NEXT: v_readlane_b32 s56, v21, 25
+; GFX940-NEXT: v_readlane_b32 s55, v21, 24
+; GFX940-NEXT: v_readlane_b32 s54, v21, 23
+; GFX940-NEXT: v_readlane_b32 s53, v21, 22
+; GFX940-NEXT: v_readlane_b32 s52, v21, 21
+; GFX940-NEXT: v_readlane_b32 s51, v21, 20
+; GFX940-NEXT: v_readlane_b32 s50, v21, 19
+; GFX940-NEXT: v_readlane_b32 s49, v21, 18
+; GFX940-NEXT: v_readlane_b32 s48, v21, 17
+; GFX940-NEXT: v_readlane_b32 s47, v21, 16
+; GFX940-NEXT: v_readlane_b32 s46, v21, 15
+; GFX940-NEXT: v_readlane_b32 s45, v21, 14
+; GFX940-NEXT: v_readlane_b32 s44, v21, 13
+; GFX940-NEXT: v_readlane_b32 s43, v21, 12
+; GFX940-NEXT: v_readlane_b32 s42, v21, 11
+; GFX940-NEXT: v_readlane_b32 s41, v21, 10
+; GFX940-NEXT: v_readlane_b32 s40, v21, 9
+; GFX940-NEXT: v_readlane_b32 s39, v21, 8
+; GFX940-NEXT: v_readlane_b32 s38, v21, 7
+; GFX940-NEXT: v_readlane_b32 s37, v21, 6
+; GFX940-NEXT: v_readlane_b32 s36, v21, 5
+; GFX940-NEXT: v_readlane_b32 s35, v21, 4
+; GFX940-NEXT: v_readlane_b32 s34, v21, 3
+; GFX940-NEXT: v_readlane_b32 s33, v21, 2
+; GFX940-NEXT: v_readlane_b32 s31, v21, 1
+; GFX940-NEXT: v_readlane_b32 s30, v21, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x4010
+; GFX940-NEXT: scratch_load_dword v21, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80200
+; GFX10_1-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_writelane_b32 v21, s30, 0
+; GFX10_1-NEXT: v_writelane_b32 v21, s31, 1
+; GFX10_1-NEXT: v_writelane_b32 v21, s33, 2
+; GFX10_1-NEXT: v_writelane_b32 v21, s34, 3
+; GFX10_1-NEXT: v_writelane_b32 v21, s35, 4
+; GFX10_1-NEXT: v_writelane_b32 v21, s36, 5
+; GFX10_1-NEXT: v_writelane_b32 v21, s37, 6
+; GFX10_1-NEXT: v_writelane_b32 v21, s38, 7
+; GFX10_1-NEXT: v_writelane_b32 v21, s39, 8
+; GFX10_1-NEXT: v_writelane_b32 v21, s40, 9
+; GFX10_1-NEXT: v_writelane_b32 v21, s41, 10
+; GFX10_1-NEXT: v_writelane_b32 v21, s42, 11
+; GFX10_1-NEXT: v_writelane_b32 v21, s43, 12
+; GFX10_1-NEXT: v_writelane_b32 v21, s44, 13
+; GFX10_1-NEXT: v_writelane_b32 v21, s45, 14
+; GFX10_1-NEXT: v_writelane_b32 v21, s46, 15
+; GFX10_1-NEXT: v_writelane_b32 v21, s47, 16
+; GFX10_1-NEXT: v_writelane_b32 v21, s48, 17
+; GFX10_1-NEXT: v_writelane_b32 v21, s49, 18
+; GFX10_1-NEXT: v_writelane_b32 v21, s50, 19
+; GFX10_1-NEXT: v_writelane_b32 v21, s51, 20
+; GFX10_1-NEXT: v_writelane_b32 v21, s52, 21
+; GFX10_1-NEXT: v_writelane_b32 v21, s53, 22
+; GFX10_1-NEXT: v_writelane_b32 v21, s54, 23
+; GFX10_1-NEXT: v_writelane_b32 v21, s55, 24
+; GFX10_1-NEXT: v_writelane_b32 v21, s56, 25
+; GFX10_1-NEXT: v_writelane_b32 v21, s57, 26
+; GFX10_1-NEXT: v_writelane_b32 v21, s58, 27
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v22, 5, s32
+; GFX10_1-NEXT: v_writelane_b32 v21, s59, 28
+; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo
+; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 16, v22
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v22
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v21, 28
+; GFX10_1-NEXT: v_readlane_b32 s58, v21, 27
+; GFX10_1-NEXT: v_readlane_b32 s57, v21, 26
+; GFX10_1-NEXT: v_readlane_b32 s56, v21, 25
+; GFX10_1-NEXT: v_readlane_b32 s55, v21, 24
+; GFX10_1-NEXT: v_readlane_b32 s54, v21, 23
+; GFX10_1-NEXT: v_readlane_b32 s53, v21, 22
+; GFX10_1-NEXT: v_readlane_b32 s52, v21, 21
+; GFX10_1-NEXT: v_readlane_b32 s51, v21, 20
+; GFX10_1-NEXT: v_readlane_b32 s50, v21, 19
+; GFX10_1-NEXT: v_readlane_b32 s49, v21, 18
+; GFX10_1-NEXT: v_readlane_b32 s48, v21, 17
+; GFX10_1-NEXT: v_readlane_b32 s47, v21, 16
+; GFX10_1-NEXT: v_readlane_b32 s46, v21, 15
+; GFX10_1-NEXT: v_readlane_b32 s45, v21, 14
+; GFX10_1-NEXT: v_readlane_b32 s44, v21, 13
+; GFX10_1-NEXT: v_readlane_b32 s43, v21, 12
+; GFX10_1-NEXT: v_readlane_b32 s42, v21, 11
+; GFX10_1-NEXT: v_readlane_b32 s41, v21, 10
+; GFX10_1-NEXT: v_readlane_b32 s40, v21, 9
+; GFX10_1-NEXT: v_readlane_b32 s39, v21, 8
+; GFX10_1-NEXT: v_readlane_b32 s38, v21, 7
+; GFX10_1-NEXT: v_readlane_b32 s37, v21, 6
+; GFX10_1-NEXT: v_readlane_b32 s36, v21, 5
+; GFX10_1-NEXT: v_readlane_b32 s35, v21, 4
+; GFX10_1-NEXT: v_readlane_b32 s34, v21, 3
+; GFX10_1-NEXT: v_readlane_b32 s33, v21, 2
+; GFX10_1-NEXT: v_readlane_b32 s31, v21, 1
+; GFX10_1-NEXT: v_readlane_b32 s30, v21, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80200
+; GFX10_1-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200
+; GFX10_3-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_writelane_b32 v21, s30, 0
+; GFX10_3-NEXT: v_writelane_b32 v21, s31, 1
+; GFX10_3-NEXT: v_writelane_b32 v21, s33, 2
+; GFX10_3-NEXT: v_writelane_b32 v21, s34, 3
+; GFX10_3-NEXT: v_writelane_b32 v21, s35, 4
+; GFX10_3-NEXT: v_writelane_b32 v21, s36, 5
+; GFX10_3-NEXT: v_writelane_b32 v21, s37, 6
+; GFX10_3-NEXT: v_writelane_b32 v21, s38, 7
+; GFX10_3-NEXT: v_writelane_b32 v21, s39, 8
+; GFX10_3-NEXT: v_writelane_b32 v21, s40, 9
+; GFX10_3-NEXT: v_writelane_b32 v21, s41, 10
+; GFX10_3-NEXT: v_writelane_b32 v21, s42, 11
+; GFX10_3-NEXT: v_writelane_b32 v21, s43, 12
+; GFX10_3-NEXT: v_writelane_b32 v21, s44, 13
+; GFX10_3-NEXT: v_writelane_b32 v21, s45, 14
+; GFX10_3-NEXT: v_writelane_b32 v21, s46, 15
+; GFX10_3-NEXT: v_writelane_b32 v21, s47, 16
+; GFX10_3-NEXT: v_writelane_b32 v21, s48, 17
+; GFX10_3-NEXT: v_writelane_b32 v21, s49, 18
+; GFX10_3-NEXT: v_writelane_b32 v21, s50, 19
+; GFX10_3-NEXT: v_writelane_b32 v21, s51, 20
+; GFX10_3-NEXT: v_writelane_b32 v21, s52, 21
+; GFX10_3-NEXT: v_writelane_b32 v21, s53, 22
+; GFX10_3-NEXT: v_writelane_b32 v21, s54, 23
+; GFX10_3-NEXT: v_writelane_b32 v21, s55, 24
+; GFX10_3-NEXT: v_writelane_b32 v21, s56, 25
+; GFX10_3-NEXT: v_writelane_b32 v21, s57, 26
+; GFX10_3-NEXT: v_writelane_b32 v21, s58, 27
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v22, 5, s32
+; GFX10_3-NEXT: v_writelane_b32 v21, s59, 28
+; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo
+; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 16, v22
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v22
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v21, 28
+; GFX10_3-NEXT: v_readlane_b32 s58, v21, 27
+; GFX10_3-NEXT: v_readlane_b32 s57, v21, 26
+; GFX10_3-NEXT: v_readlane_b32 s56, v21, 25
+; GFX10_3-NEXT: v_readlane_b32 s55, v21, 24
+; GFX10_3-NEXT: v_readlane_b32 s54, v21, 23
+; GFX10_3-NEXT: v_readlane_b32 s53, v21, 22
+; GFX10_3-NEXT: v_readlane_b32 s52, v21, 21
+; GFX10_3-NEXT: v_readlane_b32 s51, v21, 20
+; GFX10_3-NEXT: v_readlane_b32 s50, v21, 19
+; GFX10_3-NEXT: v_readlane_b32 s49, v21, 18
+; GFX10_3-NEXT: v_readlane_b32 s48, v21, 17
+; GFX10_3-NEXT: v_readlane_b32 s47, v21, 16
+; GFX10_3-NEXT: v_readlane_b32 s46, v21, 15
+; GFX10_3-NEXT: v_readlane_b32 s45, v21, 14
+; GFX10_3-NEXT: v_readlane_b32 s44, v21, 13
+; GFX10_3-NEXT: v_readlane_b32 s43, v21, 12
+; GFX10_3-NEXT: v_readlane_b32 s42, v21, 11
+; GFX10_3-NEXT: v_readlane_b32 s41, v21, 10
+; GFX10_3-NEXT: v_readlane_b32 s40, v21, 9
+; GFX10_3-NEXT: v_readlane_b32 s39, v21, 8
+; GFX10_3-NEXT: v_readlane_b32 s38, v21, 7
+; GFX10_3-NEXT: v_readlane_b32 s37, v21, 6
+; GFX10_3-NEXT: v_readlane_b32 s36, v21, 5
+; GFX10_3-NEXT: v_readlane_b32 s35, v21, 4
+; GFX10_3-NEXT: v_readlane_b32 s34, v21, 3
+; GFX10_3-NEXT: v_readlane_b32 s33, v21, 2
+; GFX10_3-NEXT: v_readlane_b32 s31, v21, 1
+; GFX10_3-NEXT: v_readlane_b32 s30, v21, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200
+; GFX10_3-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4010
+; GFX11-NEXT: scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v21, s30, 0
+; GFX11-NEXT: v_writelane_b32 v21, s31, 1
+; GFX11-NEXT: v_writelane_b32 v21, s33, 2
+; GFX11-NEXT: v_writelane_b32 v21, s34, 3
+; GFX11-NEXT: v_writelane_b32 v21, s35, 4
+; GFX11-NEXT: v_writelane_b32 v21, s36, 5
+; GFX11-NEXT: v_writelane_b32 v21, s37, 6
+; GFX11-NEXT: v_writelane_b32 v21, s38, 7
+; GFX11-NEXT: v_writelane_b32 v21, s39, 8
+; GFX11-NEXT: v_writelane_b32 v21, s40, 9
+; GFX11-NEXT: v_writelane_b32 v21, s41, 10
+; GFX11-NEXT: v_writelane_b32 v21, s42, 11
+; GFX11-NEXT: v_writelane_b32 v21, s43, 12
+; GFX11-NEXT: v_writelane_b32 v21, s44, 13
+; GFX11-NEXT: v_writelane_b32 v21, s45, 14
+; GFX11-NEXT: v_writelane_b32 v21, s46, 15
+; GFX11-NEXT: v_writelane_b32 v21, s47, 16
+; GFX11-NEXT: v_writelane_b32 v21, s48, 17
+; GFX11-NEXT: v_writelane_b32 v21, s49, 18
+; GFX11-NEXT: v_writelane_b32 v21, s50, 19
+; GFX11-NEXT: v_writelane_b32 v21, s51, 20
+; GFX11-NEXT: v_writelane_b32 v21, s52, 21
+; GFX11-NEXT: v_writelane_b32 v21, s53, 22
+; GFX11-NEXT: v_writelane_b32 v21, s54, 23
+; GFX11-NEXT: v_writelane_b32 v21, s55, 24
+; GFX11-NEXT: v_writelane_b32 v21, s56, 25
+; GFX11-NEXT: v_writelane_b32 v21, s57, 26
+; GFX11-NEXT: v_writelane_b32 v21, s58, 27
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_writelane_b32 v21, s59, 28
+; GFX11-NEXT: s_and_b32 s59, 0, exec_lo
+; GFX11-NEXT: s_addc_u32 s32, s32, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_bitcmp1_b32 s32, 0
+; GFX11-NEXT: s_bitset0_b32 s32, 0
+; GFX11-NEXT: s_mov_b32 s59, s32
+; GFX11-NEXT: s_addc_u32 s32, s32, -16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_bitcmp1_b32 s32, 0
+; GFX11-NEXT: s_bitset0_b32 s32, 0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_readlane_b32 s59, v21, 28
+; GFX11-NEXT: v_readlane_b32 s58, v21, 27
+; GFX11-NEXT: v_readlane_b32 s57, v21, 26
+; GFX11-NEXT: v_readlane_b32 s56, v21, 25
+; GFX11-NEXT: v_readlane_b32 s55, v21, 24
+; GFX11-NEXT: v_readlane_b32 s54, v21, 23
+; GFX11-NEXT: v_readlane_b32 s53, v21, 22
+; GFX11-NEXT: v_readlane_b32 s52, v21, 21
+; GFX11-NEXT: v_readlane_b32 s51, v21, 20
+; GFX11-NEXT: v_readlane_b32 s50, v21, 19
+; GFX11-NEXT: v_readlane_b32 s49, v21, 18
+; GFX11-NEXT: v_readlane_b32 s48, v21, 17
+; GFX11-NEXT: v_readlane_b32 s47, v21, 16
+; GFX11-NEXT: v_readlane_b32 s46, v21, 15
+; GFX11-NEXT: v_readlane_b32 s45, v21, 14
+; GFX11-NEXT: v_readlane_b32 s44, v21, 13
+; GFX11-NEXT: v_readlane_b32 s43, v21, 12
+; GFX11-NEXT: v_readlane_b32 s42, v21, 11
+; GFX11-NEXT: v_readlane_b32 s41, v21, 10
+; GFX11-NEXT: v_readlane_b32 s40, v21, 9
+; GFX11-NEXT: v_readlane_b32 s39, v21, 8
+; GFX11-NEXT: v_readlane_b32 s38, v21, 7
+; GFX11-NEXT: v_readlane_b32 s37, v21, 6
+; GFX11-NEXT: v_readlane_b32 s36, v21, 5
+; GFX11-NEXT: v_readlane_b32 s35, v21, 4
+; GFX11-NEXT: v_readlane_b32 s34, v21, 3
+; GFX11-NEXT: v_readlane_b32 s33, v21, 2
+; GFX11-NEXT: v_readlane_b32 s31, v21, 1
+; GFX11-NEXT: v_readlane_b32 s30, v21, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x4010
+; GFX11-NEXT: scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowest_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: v_writelane_b32 v21, s30, 0
+; GFX12-NEXT: v_writelane_b32 v21, s31, 1
+; GFX12-NEXT: v_writelane_b32 v21, s33, 2
+; GFX12-NEXT: v_writelane_b32 v21, s34, 3
+; GFX12-NEXT: v_writelane_b32 v21, s35, 4
+; GFX12-NEXT: v_writelane_b32 v21, s36, 5
+; GFX12-NEXT: v_writelane_b32 v21, s37, 6
+; GFX12-NEXT: v_writelane_b32 v21, s38, 7
+; GFX12-NEXT: v_writelane_b32 v21, s39, 8
+; GFX12-NEXT: v_writelane_b32 v21, s40, 9
+; GFX12-NEXT: v_writelane_b32 v21, s41, 10
+; GFX12-NEXT: v_writelane_b32 v21, s42, 11
+; GFX12-NEXT: v_writelane_b32 v21, s43, 12
+; GFX12-NEXT: v_writelane_b32 v21, s44, 13
+; GFX12-NEXT: v_writelane_b32 v21, s45, 14
+; GFX12-NEXT: v_writelane_b32 v21, s46, 15
+; GFX12-NEXT: v_writelane_b32 v21, s47, 16
+; GFX12-NEXT: v_writelane_b32 v21, s48, 17
+; GFX12-NEXT: v_writelane_b32 v21, s49, 18
+; GFX12-NEXT: v_writelane_b32 v21, s50, 19
+; GFX12-NEXT: v_writelane_b32 v21, s51, 20
+; GFX12-NEXT: v_writelane_b32 v21, s52, 21
+; GFX12-NEXT: v_writelane_b32 v21, s53, 22
+; GFX12-NEXT: v_writelane_b32 v21, s54, 23
+; GFX12-NEXT: v_writelane_b32 v21, s55, 24
+; GFX12-NEXT: v_writelane_b32 v21, s56, 25
+; GFX12-NEXT: v_writelane_b32 v21, s57, 26
+; GFX12-NEXT: v_writelane_b32 v21, s58, 27
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_writelane_b32 v21, s59, 28
+; GFX12-NEXT: s_and_b32 s59, 0, exec_lo
+; GFX12-NEXT: s_mov_b32 s59, s32
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readlane_b32 s59, v21, 28
+; GFX12-NEXT: v_readlane_b32 s58, v21, 27
+; GFX12-NEXT: v_readlane_b32 s57, v21, 26
+; GFX12-NEXT: v_readlane_b32 s56, v21, 25
+; GFX12-NEXT: v_readlane_b32 s55, v21, 24
+; GFX12-NEXT: v_readlane_b32 s54, v21, 23
+; GFX12-NEXT: v_readlane_b32 s53, v21, 22
+; GFX12-NEXT: v_readlane_b32 s52, v21, 21
+; GFX12-NEXT: v_readlane_b32 s51, v21, 20
+; GFX12-NEXT: v_readlane_b32 s50, v21, 19
+; GFX12-NEXT: v_readlane_b32 s49, v21, 18
+; GFX12-NEXT: v_readlane_b32 s48, v21, 17
+; GFX12-NEXT: v_readlane_b32 s47, v21, 16
+; GFX12-NEXT: v_readlane_b32 s46, v21, 15
+; GFX12-NEXT: v_readlane_b32 s45, v21, 14
+; GFX12-NEXT: v_readlane_b32 s44, v21, 13
+; GFX12-NEXT: v_readlane_b32 s43, v21, 12
+; GFX12-NEXT: v_readlane_b32 s42, v21, 11
+; GFX12-NEXT: v_readlane_b32 s41, v21, 10
+; GFX12-NEXT: v_readlane_b32 s40, v21, 9
+; GFX12-NEXT: v_readlane_b32 s39, v21, 8
+; GFX12-NEXT: v_readlane_b32 s38, v21, 7
+; GFX12-NEXT: v_readlane_b32 s37, v21, 6
+; GFX12-NEXT: v_readlane_b32 s36, v21, 5
+; GFX12-NEXT: v_readlane_b32 s35, v21, 4
+; GFX12-NEXT: v_readlane_b32 s34, v21, 3
+; GFX12-NEXT: v_readlane_b32 s33, v21, 2
+; GFX12-NEXT: v_readlane_b32 s31, v21, 1
+; GFX12-NEXT: v_readlane_b32 s30, v21, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 16, addrspace(5)
+
+ ; Force no SGPRs to be available for the carry-out of the vector add.
+ %asm = call %asm.output2 asm sideeffect
+ "; def $0, $1, $2, $3, $4, $5, $6, $7, $8",
+ "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:20]},={vcc}"()
+
+ %s0 = extractvalue %asm.output2 %asm, 0
+ %s1 = extractvalue %asm.output2 %asm, 1
+ %s2 = extractvalue %asm.output2 %asm, 2
+ %s3 = extractvalue %asm.output2 %asm, 3
+ %s4 = extractvalue %asm.output2 %asm, 4
+ %s5 = extractvalue %asm.output2 %asm, 5
+
+ %v0 = extractvalue %asm.output2 %asm, 6
+ %v1 = extractvalue %asm.output2 %asm, 7
+
+ %vcc = extractvalue %asm.output2 %asm, 8
+
+ ; scc is unavailable since it is live in
+ call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10",
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:20]},{vcc},{s59},{scc}"(
+ <16 x i32> %s0,
+ <16 x i32> %s1,
+ <16 x i32> %s2,
+ <8 x i32> %s3,
+ <2 x i32> %s4,
+ i32 %s5,
+ <16 x i32> %v0,
+ <5 x i32> %v1,
+ i64 %vcc,
+ ptr addrspace(5) %alloca0,
+ i32 0) ; use of scc
+
+ ret void
+}
+
+; This case isn't using SGPRs yet.
+; FIXME: Should also use one more VGPR, but currently fails to allocate on gfx8.
+define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset() #0 {
+; GFX7-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT: s_add_i32 s6, s32, 0x201200
+; GFX7-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: v_writelane_b32 v23, s28, 28
+; GFX7-NEXT: v_writelane_b32 v23, s29, 29
+; GFX7-NEXT: v_writelane_b32 v23, s30, 0
+; GFX7-NEXT: v_writelane_b32 v23, s31, 1
+; GFX7-NEXT: v_writelane_b32 v23, s33, 2
+; GFX7-NEXT: v_writelane_b32 v23, s34, 3
+; GFX7-NEXT: v_writelane_b32 v23, s35, 4
+; GFX7-NEXT: v_writelane_b32 v23, s36, 5
+; GFX7-NEXT: v_writelane_b32 v23, s37, 6
+; GFX7-NEXT: v_writelane_b32 v23, s38, 7
+; GFX7-NEXT: v_writelane_b32 v23, s39, 8
+; GFX7-NEXT: v_writelane_b32 v23, s40, 9
+; GFX7-NEXT: v_writelane_b32 v23, s41, 10
+; GFX7-NEXT: v_writelane_b32 v23, s42, 11
+; GFX7-NEXT: v_writelane_b32 v23, s43, 12
+; GFX7-NEXT: v_writelane_b32 v23, s44, 13
+; GFX7-NEXT: v_writelane_b32 v23, s45, 14
+; GFX7-NEXT: v_writelane_b32 v23, s46, 15
+; GFX7-NEXT: v_writelane_b32 v23, s47, 16
+; GFX7-NEXT: v_writelane_b32 v23, s48, 17
+; GFX7-NEXT: v_writelane_b32 v23, s49, 18
+; GFX7-NEXT: v_writelane_b32 v23, s50, 19
+; GFX7-NEXT: v_writelane_b32 v23, s51, 20
+; GFX7-NEXT: v_writelane_b32 v23, s52, 21
+; GFX7-NEXT: v_writelane_b32 v23, s53, 22
+; GFX7-NEXT: v_writelane_b32 v23, s54, 23
+; GFX7-NEXT: v_writelane_b32 v23, s55, 24
+; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT: v_writelane_b32 v23, s56, 25
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0
+; GFX7-NEXT: v_writelane_b32 v23, s57, 26
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; use alloca0 v0
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: ; implicit-def: $vgpr22
+; GFX7-NEXT: v_writelane_b32 v23, s59, 27
+; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0
+; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1
+; GFX7-NEXT: s_or_saveexec_b64 s[28:29], -1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x8044
+; GFX7-NEXT: buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill
+; GFX7-NEXT: s_mov_b64 exec, s[28:29]
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT: v_lshr_b32_e64 v22, s32, 6
+; GFX7-NEXT: s_movk_i32 vcc_lo, 0x4040
+; GFX7-NEXT: v_add_i32_e32 v22, vcc, vcc_lo, v22
+; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0x200, v22
+; GFX7-NEXT: v_readfirstlane_b32 s59, v22
+; GFX7-NEXT: s_and_b64 vcc, 0, exec
+; GFX7-NEXT: s_mov_b64 s[28:29], exec
+; GFX7-NEXT: s_mov_b64 exec, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x8044
+; GFX7-NEXT: buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[28:29]
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_readlane_b32 vcc_lo, v22, 0
+; GFX7-NEXT: v_readlane_b32 vcc_hi, v22, 1
+; GFX7-NEXT: s_mov_b64 s[28:29], exec
+; GFX7-NEXT: s_mov_b64 exec, -1
+; GFX7-NEXT: s_mov_b64 exec, s[28:29]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ;;#ASMSTART
+; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX7-NEXT: ;;#ASMEND
+; GFX7-NEXT: v_readlane_b32 s59, v23, 27
+; GFX7-NEXT: v_readlane_b32 s57, v23, 26
+; GFX7-NEXT: v_readlane_b32 s56, v23, 25
+; GFX7-NEXT: v_readlane_b32 s55, v23, 24
+; GFX7-NEXT: v_readlane_b32 s54, v23, 23
+; GFX7-NEXT: v_readlane_b32 s53, v23, 22
+; GFX7-NEXT: v_readlane_b32 s52, v23, 21
+; GFX7-NEXT: v_readlane_b32 s51, v23, 20
+; GFX7-NEXT: v_readlane_b32 s50, v23, 19
+; GFX7-NEXT: v_readlane_b32 s49, v23, 18
+; GFX7-NEXT: v_readlane_b32 s48, v23, 17
+; GFX7-NEXT: v_readlane_b32 s47, v23, 16
+; GFX7-NEXT: v_readlane_b32 s46, v23, 15
+; GFX7-NEXT: v_readlane_b32 s45, v23, 14
+; GFX7-NEXT: v_readlane_b32 s44, v23, 13
+; GFX7-NEXT: v_readlane_b32 s43, v23, 12
+; GFX7-NEXT: v_readlane_b32 s42, v23, 11
+; GFX7-NEXT: v_readlane_b32 s41, v23, 10
+; GFX7-NEXT: v_readlane_b32 s40, v23, 9
+; GFX7-NEXT: v_readlane_b32 s39, v23, 8
+; GFX7-NEXT: v_readlane_b32 s38, v23, 7
+; GFX7-NEXT: v_readlane_b32 s37, v23, 6
+; GFX7-NEXT: v_readlane_b32 s36, v23, 5
+; GFX7-NEXT: v_readlane_b32 s35, v23, 4
+; GFX7-NEXT: v_readlane_b32 s34, v23, 3
+; GFX7-NEXT: v_readlane_b32 s33, v23, 2
+; GFX7-NEXT: v_readlane_b32 s31, v23, 1
+; GFX7-NEXT: v_readlane_b32 s30, v23, 0
+; GFX7-NEXT: ; kill: killed $vgpr22
+; GFX7-NEXT: v_readlane_b32 s28, v23, 28
+; GFX7-NEXT: v_readlane_b32 s29, v23, 29
+; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX7-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT: s_add_i32 s6, s32, 0x201200
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX7-NEXT: s_mov_b64 exec, s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_add_i32 s6, s32, 0x201200
+; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: v_writelane_b32 v23, s58, 28
+; GFX8-NEXT: v_writelane_b32 v23, s59, 29
+; GFX8-NEXT: v_writelane_b32 v23, s30, 0
+; GFX8-NEXT: v_writelane_b32 v23, s31, 1
+; GFX8-NEXT: v_writelane_b32 v23, s33, 2
+; GFX8-NEXT: v_writelane_b32 v23, s34, 3
+; GFX8-NEXT: v_writelane_b32 v23, s35, 4
+; GFX8-NEXT: v_writelane_b32 v23, s36, 5
+; GFX8-NEXT: v_writelane_b32 v23, s37, 6
+; GFX8-NEXT: v_writelane_b32 v23, s38, 7
+; GFX8-NEXT: v_writelane_b32 v23, s39, 8
+; GFX8-NEXT: v_writelane_b32 v23, s40, 9
+; GFX8-NEXT: v_writelane_b32 v23, s41, 10
+; GFX8-NEXT: v_writelane_b32 v23, s42, 11
+; GFX8-NEXT: v_writelane_b32 v23, s43, 12
+; GFX8-NEXT: v_writelane_b32 v23, s44, 13
+; GFX8-NEXT: v_writelane_b32 v23, s45, 14
+; GFX8-NEXT: v_writelane_b32 v23, s46, 15
+; GFX8-NEXT: v_writelane_b32 v23, s47, 16
+; GFX8-NEXT: v_writelane_b32 v23, s48, 17
+; GFX8-NEXT: v_writelane_b32 v23, s49, 18
+; GFX8-NEXT: v_writelane_b32 v23, s50, 19
+; GFX8-NEXT: v_writelane_b32 v23, s51, 20
+; GFX8-NEXT: v_writelane_b32 v23, s52, 21
+; GFX8-NEXT: v_writelane_b32 v23, s53, 22
+; GFX8-NEXT: v_writelane_b32 v23, s54, 23
+; GFX8-NEXT: v_writelane_b32 v23, s55, 24
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: v_writelane_b32 v23, s56, 25
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-NEXT: v_writelane_b32 v23, s57, 26
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use alloca0 v0
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: ; implicit-def: $vgpr22
+; GFX8-NEXT: v_writelane_b32 v23, s59, 27
+; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0
+; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1
+; GFX8-NEXT: s_or_saveexec_b64 s[58:59], -1
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x8044
+; GFX8-NEXT: buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill
+; GFX8-NEXT: s_mov_b64 exec, s[58:59]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT: v_lshrrev_b32_e64 v22, 6, s32
+; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, vcc_lo, v22
+; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0x200, v22
+; GFX8-NEXT: v_readfirstlane_b32 s59, v22
+; GFX8-NEXT: s_and_b64 vcc, 0, exec
+; GFX8-NEXT: s_mov_b64 s[58:59], exec
+; GFX8-NEXT: s_mov_b64 exec, -1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x8044
+; GFX8-NEXT: buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[58:59]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_readlane_b32 vcc_lo, v22, 0
+; GFX8-NEXT: v_readlane_b32 vcc_hi, v22, 1
+; GFX8-NEXT: s_mov_b64 s[58:59], exec
+; GFX8-NEXT: s_mov_b64 exec, -1
+; GFX8-NEXT: s_mov_b64 exec, s[58:59]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ;;#ASMSTART
+; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX8-NEXT: ;;#ASMEND
+; GFX8-NEXT: v_readlane_b32 s59, v23, 27
+; GFX8-NEXT: v_readlane_b32 s57, v23, 26
+; GFX8-NEXT: v_readlane_b32 s56, v23, 25
+; GFX8-NEXT: v_readlane_b32 s55, v23, 24
+; GFX8-NEXT: v_readlane_b32 s54, v23, 23
+; GFX8-NEXT: v_readlane_b32 s53, v23, 22
+; GFX8-NEXT: v_readlane_b32 s52, v23, 21
+; GFX8-NEXT: v_readlane_b32 s51, v23, 20
+; GFX8-NEXT: v_readlane_b32 s50, v23, 19
+; GFX8-NEXT: v_readlane_b32 s49, v23, 18
+; GFX8-NEXT: v_readlane_b32 s48, v23, 17
+; GFX8-NEXT: v_readlane_b32 s47, v23, 16
+; GFX8-NEXT: v_readlane_b32 s46, v23, 15
+; GFX8-NEXT: v_readlane_b32 s45, v23, 14
+; GFX8-NEXT: v_readlane_b32 s44, v23, 13
+; GFX8-NEXT: v_readlane_b32 s43, v23, 12
+; GFX8-NEXT: v_readlane_b32 s42, v23, 11
+; GFX8-NEXT: v_readlane_b32 s41, v23, 10
+; GFX8-NEXT: v_readlane_b32 s40, v23, 9
+; GFX8-NEXT: v_readlane_b32 s39, v23, 8
+; GFX8-NEXT: v_readlane_b32 s38, v23, 7
+; GFX8-NEXT: v_readlane_b32 s37, v23, 6
+; GFX8-NEXT: v_readlane_b32 s36, v23, 5
+; GFX8-NEXT: v_readlane_b32 s35, v23, 4
+; GFX8-NEXT: v_readlane_b32 s34, v23, 3
+; GFX8-NEXT: v_readlane_b32 s33, v23, 2
+; GFX8-NEXT: v_readlane_b32 s31, v23, 1
+; GFX8-NEXT: v_readlane_b32 s30, v23, 0
+; GFX8-NEXT: ; kill: killed $vgpr22
+; GFX8-NEXT: v_readlane_b32 s58, v23, 28
+; GFX8-NEXT: v_readlane_b32 s59, v23, 29
+; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_add_i32 s6, s32, 0x201200
+; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX8-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: v_writelane_b32 v22, s30, 0
+; GFX900-NEXT: v_writelane_b32 v22, s31, 1
+; GFX900-NEXT: v_writelane_b32 v22, s33, 2
+; GFX900-NEXT: v_writelane_b32 v22, s34, 3
+; GFX900-NEXT: v_writelane_b32 v22, s35, 4
+; GFX900-NEXT: v_writelane_b32 v22, s36, 5
+; GFX900-NEXT: v_writelane_b32 v22, s37, 6
+; GFX900-NEXT: v_writelane_b32 v22, s38, 7
+; GFX900-NEXT: v_writelane_b32 v22, s39, 8
+; GFX900-NEXT: v_writelane_b32 v22, s40, 9
+; GFX900-NEXT: v_writelane_b32 v22, s41, 10
+; GFX900-NEXT: v_writelane_b32 v22, s42, 11
+; GFX900-NEXT: v_writelane_b32 v22, s43, 12
+; GFX900-NEXT: v_writelane_b32 v22, s44, 13
+; GFX900-NEXT: v_writelane_b32 v22, s45, 14
+; GFX900-NEXT: v_writelane_b32 v22, s46, 15
+; GFX900-NEXT: v_writelane_b32 v22, s47, 16
+; GFX900-NEXT: v_writelane_b32 v22, s48, 17
+; GFX900-NEXT: v_writelane_b32 v22, s49, 18
+; GFX900-NEXT: v_writelane_b32 v22, s50, 19
+; GFX900-NEXT: v_writelane_b32 v22, s51, 20
+; GFX900-NEXT: v_writelane_b32 v22, s52, 21
+; GFX900-NEXT: v_writelane_b32 v22, s53, 22
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_writelane_b32 v22, s54, 23
+; GFX900-NEXT: v_add_u32_e32 v0, 64, v0
+; GFX900-NEXT: v_writelane_b32 v22, s55, 24
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use alloca0 v0
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX900-NEXT: v_writelane_b32 v22, s56, 25
+; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0
+; GFX900-NEXT: v_writelane_b32 v22, s57, 26
+; GFX900-NEXT: v_add_u32_e32 v0, 0x200, v0
+; GFX900-NEXT: s_and_b64 s[4:5], 0, exec
+; GFX900-NEXT: v_writelane_b32 v22, s59, 27
+; GFX900-NEXT: v_readfirstlane_b32 s59, v0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_readlane_b32 s59, v22, 27
+; GFX900-NEXT: v_readlane_b32 s57, v22, 26
+; GFX900-NEXT: v_readlane_b32 s56, v22, 25
+; GFX900-NEXT: v_readlane_b32 s55, v22, 24
+; GFX900-NEXT: v_readlane_b32 s54, v22, 23
+; GFX900-NEXT: v_readlane_b32 s53, v22, 22
+; GFX900-NEXT: v_readlane_b32 s52, v22, 21
+; GFX900-NEXT: v_readlane_b32 s51, v22, 20
+; GFX900-NEXT: v_readlane_b32 s50, v22, 19
+; GFX900-NEXT: v_readlane_b32 s49, v22, 18
+; GFX900-NEXT: v_readlane_b32 s48, v22, 17
+; GFX900-NEXT: v_readlane_b32 s47, v22, 16
+; GFX900-NEXT: v_readlane_b32 s46, v22, 15
+; GFX900-NEXT: v_readlane_b32 s45, v22, 14
+; GFX900-NEXT: v_readlane_b32 s44, v22, 13
+; GFX900-NEXT: v_readlane_b32 s43, v22, 12
+; GFX900-NEXT: v_readlane_b32 s42, v22, 11
+; GFX900-NEXT: v_readlane_b32 s41, v22, 10
+; GFX900-NEXT: v_readlane_b32 s40, v22, 9
+; GFX900-NEXT: v_readlane_b32 s39, v22, 8
+; GFX900-NEXT: v_readlane_b32 s38, v22, 7
+; GFX900-NEXT: v_readlane_b32 s37, v22, 6
+; GFX900-NEXT: v_readlane_b32 s36, v22, 5
+; GFX900-NEXT: v_readlane_b32 s35, v22, 4
+; GFX900-NEXT: v_readlane_b32 s34, v22, 3
+; GFX900-NEXT: v_readlane_b32 s33, v22, 2
+; GFX900-NEXT: v_readlane_b32 s31, v22, 1
+; GFX900-NEXT: v_readlane_b32 s30, v22, 0
+; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX900-NEXT: s_add_i32 s6, s32, 0x201000
+; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
+; GFX900-NEXT: s_mov_b64 exec, s[4:5]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT: scratch_store_dword off, v23, s2 sc0 sc1 ; 4-byte Folded Spill
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: v_writelane_b32 v23, s30, 0
+; GFX940-NEXT: v_writelane_b32 v23, s31, 1
+; GFX940-NEXT: v_writelane_b32 v23, s33, 2
+; GFX940-NEXT: v_writelane_b32 v23, s34, 3
+; GFX940-NEXT: v_writelane_b32 v23, s35, 4
+; GFX940-NEXT: v_writelane_b32 v23, s36, 5
+; GFX940-NEXT: v_writelane_b32 v23, s37, 6
+; GFX940-NEXT: v_writelane_b32 v23, s38, 7
+; GFX940-NEXT: v_writelane_b32 v23, s39, 8
+; GFX940-NEXT: v_writelane_b32 v23, s40, 9
+; GFX940-NEXT: v_writelane_b32 v23, s41, 10
+; GFX940-NEXT: v_writelane_b32 v23, s42, 11
+; GFX940-NEXT: v_writelane_b32 v23, s43, 12
+; GFX940-NEXT: v_writelane_b32 v23, s44, 13
+; GFX940-NEXT: v_writelane_b32 v23, s45, 14
+; GFX940-NEXT: v_writelane_b32 v23, s46, 15
+; GFX940-NEXT: v_writelane_b32 v23, s47, 16
+; GFX940-NEXT: v_writelane_b32 v23, s48, 17
+; GFX940-NEXT: v_writelane_b32 v23, s49, 18
+; GFX940-NEXT: v_writelane_b32 v23, s50, 19
+; GFX940-NEXT: v_writelane_b32 v23, s51, 20
+; GFX940-NEXT: v_writelane_b32 v23, s52, 21
+; GFX940-NEXT: v_writelane_b32 v23, s53, 22
+; GFX940-NEXT: v_writelane_b32 v23, s54, 23
+; GFX940-NEXT: v_writelane_b32 v23, s55, 24
+; GFX940-NEXT: v_writelane_b32 v23, s56, 25
+; GFX940-NEXT: s_add_i32 s0, s32, 64
+; GFX940-NEXT: v_writelane_b32 v23, s57, 26
+; GFX940-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NEXT: v_writelane_b32 v23, s59, 27
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use alloca0 v0
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_add_i32 s59, s32, 0x4040
+; GFX940-NEXT: v_mov_b32_e32 v22, s59
+; GFX940-NEXT: v_writelane_b32 v23, s60, 28
+; GFX940-NEXT: v_add_u32_e32 v22, 0x200, v22
+; GFX940-NEXT: v_writelane_b32 v23, s61, 29
+; GFX940-NEXT: v_readfirstlane_b32 s59, v22
+; GFX940-NEXT: s_and_b64 s[60:61], 0, exec
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_readlane_b32 s61, v23, 29
+; GFX940-NEXT: v_readlane_b32 s60, v23, 28
+; GFX940-NEXT: v_readlane_b32 s59, v23, 27
+; GFX940-NEXT: v_readlane_b32 s57, v23, 26
+; GFX940-NEXT: v_readlane_b32 s56, v23, 25
+; GFX940-NEXT: v_readlane_b32 s55, v23, 24
+; GFX940-NEXT: v_readlane_b32 s54, v23, 23
+; GFX940-NEXT: v_readlane_b32 s53, v23, 22
+; GFX940-NEXT: v_readlane_b32 s52, v23, 21
+; GFX940-NEXT: v_readlane_b32 s51, v23, 20
+; GFX940-NEXT: v_readlane_b32 s50, v23, 19
+; GFX940-NEXT: v_readlane_b32 s49, v23, 18
+; GFX940-NEXT: v_readlane_b32 s48, v23, 17
+; GFX940-NEXT: v_readlane_b32 s47, v23, 16
+; GFX940-NEXT: v_readlane_b32 s46, v23, 15
+; GFX940-NEXT: v_readlane_b32 s45, v23, 14
+; GFX940-NEXT: v_readlane_b32 s44, v23, 13
+; GFX940-NEXT: v_readlane_b32 s43, v23, 12
+; GFX940-NEXT: v_readlane_b32 s42, v23, 11
+; GFX940-NEXT: v_readlane_b32 s41, v23, 10
+; GFX940-NEXT: v_readlane_b32 s40, v23, 9
+; GFX940-NEXT: v_readlane_b32 s39, v23, 8
+; GFX940-NEXT: v_readlane_b32 s38, v23, 7
+; GFX940-NEXT: v_readlane_b32 s37, v23, 6
+; GFX940-NEXT: v_readlane_b32 s36, v23, 5
+; GFX940-NEXT: v_readlane_b32 s35, v23, 4
+; GFX940-NEXT: v_readlane_b32 s34, v23, 3
+; GFX940-NEXT: v_readlane_b32 s33, v23, 2
+; GFX940-NEXT: v_readlane_b32 s31, v23, 1
+; GFX940-NEXT: v_readlane_b32 s30, v23, 0
+; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX940-NEXT: s_add_i32 s2, s32, 0x8040
+; GFX940-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload
+; GFX940-NEXT: s_mov_b64 exec, s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_1-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX10_1: ; %bb.0:
+; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use alloca0 v1
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2
+; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3
+; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4
+; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5
+; GFX10_1-NEXT: v_writelane_b32 v23, s37, 6
+; GFX10_1-NEXT: v_writelane_b32 v23, s38, 7
+; GFX10_1-NEXT: v_writelane_b32 v23, s39, 8
+; GFX10_1-NEXT: v_writelane_b32 v23, s40, 9
+; GFX10_1-NEXT: v_writelane_b32 v23, s41, 10
+; GFX10_1-NEXT: v_writelane_b32 v23, s42, 11
+; GFX10_1-NEXT: v_writelane_b32 v23, s43, 12
+; GFX10_1-NEXT: v_writelane_b32 v23, s44, 13
+; GFX10_1-NEXT: v_writelane_b32 v23, s45, 14
+; GFX10_1-NEXT: v_writelane_b32 v23, s46, 15
+; GFX10_1-NEXT: v_writelane_b32 v23, s47, 16
+; GFX10_1-NEXT: v_writelane_b32 v23, s48, 17
+; GFX10_1-NEXT: v_writelane_b32 v23, s49, 18
+; GFX10_1-NEXT: v_writelane_b32 v23, s50, 19
+; GFX10_1-NEXT: v_writelane_b32 v23, s51, 20
+; GFX10_1-NEXT: v_writelane_b32 v23, s52, 21
+; GFX10_1-NEXT: v_writelane_b32 v23, s53, 22
+; GFX10_1-NEXT: v_writelane_b32 v23, s54, 23
+; GFX10_1-NEXT: v_writelane_b32 v23, s55, 24
+; GFX10_1-NEXT: v_writelane_b32 v23, s56, 25
+; GFX10_1-NEXT: v_writelane_b32 v23, s57, 26
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_writelane_b32 v23, s59, 27
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v22
+; GFX10_1-NEXT: ;;#ASMSTART
+; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX10_1-NEXT: ;;#ASMEND
+; GFX10_1-NEXT: v_readlane_b32 s59, v23, 27
+; GFX10_1-NEXT: v_readlane_b32 s57, v23, 26
+; GFX10_1-NEXT: v_readlane_b32 s56, v23, 25
+; GFX10_1-NEXT: v_readlane_b32 s55, v23, 24
+; GFX10_1-NEXT: v_readlane_b32 s54, v23, 23
+; GFX10_1-NEXT: v_readlane_b32 s53, v23, 22
+; GFX10_1-NEXT: v_readlane_b32 s52, v23, 21
+; GFX10_1-NEXT: v_readlane_b32 s51, v23, 20
+; GFX10_1-NEXT: v_readlane_b32 s50, v23, 19
+; GFX10_1-NEXT: v_readlane_b32 s49, v23, 18
+; GFX10_1-NEXT: v_readlane_b32 s48, v23, 17
+; GFX10_1-NEXT: v_readlane_b32 s47, v23, 16
+; GFX10_1-NEXT: v_readlane_b32 s46, v23, 15
+; GFX10_1-NEXT: v_readlane_b32 s45, v23, 14
+; GFX10_1-NEXT: v_readlane_b32 s44, v23, 13
+; GFX10_1-NEXT: v_readlane_b32 s43, v23, 12
+; GFX10_1-NEXT: v_readlane_b32 s42, v23, 11
+; GFX10_1-NEXT: v_readlane_b32 s41, v23, 10
+; GFX10_1-NEXT: v_readlane_b32 s40, v23, 9
+; GFX10_1-NEXT: v_readlane_b32 s39, v23, 8
+; GFX10_1-NEXT: v_readlane_b32 s38, v23, 7
+; GFX10_1-NEXT: v_readlane_b32 s37, v23, 6
+; GFX10_1-NEXT: v_readlane_b32 s36, v23, 5
+; GFX10_1-NEXT: v_readlane_b32 s35, v23, 4
+; GFX10_1-NEXT: v_readlane_b32 s34, v23, 3
+; GFX10_1-NEXT: v_readlane_b32 s33, v23, 2
+; GFX10_1-NEXT: v_readlane_b32 s31, v23, 1
+; GFX10_1-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_1-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_1-NEXT: s_waitcnt vmcnt(0)
+; GFX10_1-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10_3-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX10_3: ; %bb.0:
+; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
+; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use alloca0 v1
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2
+; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3
+; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4
+; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5
+; GFX10_3-NEXT: v_writelane_b32 v23, s37, 6
+; GFX10_3-NEXT: v_writelane_b32 v23, s38, 7
+; GFX10_3-NEXT: v_writelane_b32 v23, s39, 8
+; GFX10_3-NEXT: v_writelane_b32 v23, s40, 9
+; GFX10_3-NEXT: v_writelane_b32 v23, s41, 10
+; GFX10_3-NEXT: v_writelane_b32 v23, s42, 11
+; GFX10_3-NEXT: v_writelane_b32 v23, s43, 12
+; GFX10_3-NEXT: v_writelane_b32 v23, s44, 13
+; GFX10_3-NEXT: v_writelane_b32 v23, s45, 14
+; GFX10_3-NEXT: v_writelane_b32 v23, s46, 15
+; GFX10_3-NEXT: v_writelane_b32 v23, s47, 16
+; GFX10_3-NEXT: v_writelane_b32 v23, s48, 17
+; GFX10_3-NEXT: v_writelane_b32 v23, s49, 18
+; GFX10_3-NEXT: v_writelane_b32 v23, s50, 19
+; GFX10_3-NEXT: v_writelane_b32 v23, s51, 20
+; GFX10_3-NEXT: v_writelane_b32 v23, s52, 21
+; GFX10_3-NEXT: v_writelane_b32 v23, s53, 22
+; GFX10_3-NEXT: v_writelane_b32 v23, s54, 23
+; GFX10_3-NEXT: v_writelane_b32 v23, s55, 24
+; GFX10_3-NEXT: v_writelane_b32 v23, s56, 25
+; GFX10_3-NEXT: v_writelane_b32 v23, s57, 26
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_writelane_b32 v23, s59, 27
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v22
+; GFX10_3-NEXT: ;;#ASMSTART
+; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX10_3-NEXT: ;;#ASMEND
+; GFX10_3-NEXT: v_readlane_b32 s59, v23, 27
+; GFX10_3-NEXT: v_readlane_b32 s57, v23, 26
+; GFX10_3-NEXT: v_readlane_b32 s56, v23, 25
+; GFX10_3-NEXT: v_readlane_b32 s55, v23, 24
+; GFX10_3-NEXT: v_readlane_b32 s54, v23, 23
+; GFX10_3-NEXT: v_readlane_b32 s53, v23, 22
+; GFX10_3-NEXT: v_readlane_b32 s52, v23, 21
+; GFX10_3-NEXT: v_readlane_b32 s51, v23, 20
+; GFX10_3-NEXT: v_readlane_b32 s50, v23, 19
+; GFX10_3-NEXT: v_readlane_b32 s49, v23, 18
+; GFX10_3-NEXT: v_readlane_b32 s48, v23, 17
+; GFX10_3-NEXT: v_readlane_b32 s47, v23, 16
+; GFX10_3-NEXT: v_readlane_b32 s46, v23, 15
+; GFX10_3-NEXT: v_readlane_b32 s45, v23, 14
+; GFX10_3-NEXT: v_readlane_b32 s44, v23, 13
+; GFX10_3-NEXT: v_readlane_b32 s43, v23, 12
+; GFX10_3-NEXT: v_readlane_b32 s42, v23, 11
+; GFX10_3-NEXT: v_readlane_b32 s41, v23, 10
+; GFX10_3-NEXT: v_readlane_b32 s40, v23, 9
+; GFX10_3-NEXT: v_readlane_b32 s39, v23, 8
+; GFX10_3-NEXT: v_readlane_b32 s38, v23, 7
+; GFX10_3-NEXT: v_readlane_b32 s37, v23, 6
+; GFX10_3-NEXT: v_readlane_b32 s36, v23, 5
+; GFX10_3-NEXT: v_readlane_b32 s35, v23, 4
+; GFX10_3-NEXT: v_readlane_b32 s34, v23, 3
+; GFX10_3-NEXT: v_readlane_b32 s33, v23, 2
+; GFX10_3-NEXT: v_readlane_b32 s31, v23, 1
+; GFX10_3-NEXT: v_readlane_b32 s30, v23, 0
+; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1
+; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
+; GFX10_3-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload
+; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
+; GFX10_3-NEXT: s_waitcnt vmcnt(0)
+; GFX10_3-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: v_writelane_b32 v23, s30, 0
+; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: v_writelane_b32 v23, s31, 1
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use alloca0 v1
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_writelane_b32 v23, s33, 2
+; GFX11-NEXT: v_writelane_b32 v23, s34, 3
+; GFX11-NEXT: v_writelane_b32 v23, s35, 4
+; GFX11-NEXT: v_writelane_b32 v23, s36, 5
+; GFX11-NEXT: v_writelane_b32 v23, s37, 6
+; GFX11-NEXT: v_writelane_b32 v23, s38, 7
+; GFX11-NEXT: v_writelane_b32 v23, s39, 8
+; GFX11-NEXT: v_writelane_b32 v23, s40, 9
+; GFX11-NEXT: v_writelane_b32 v23, s41, 10
+; GFX11-NEXT: v_writelane_b32 v23, s42, 11
+; GFX11-NEXT: v_writelane_b32 v23, s43, 12
+; GFX11-NEXT: v_writelane_b32 v23, s44, 13
+; GFX11-NEXT: v_writelane_b32 v23, s45, 14
+; GFX11-NEXT: v_writelane_b32 v23, s46, 15
+; GFX11-NEXT: v_writelane_b32 v23, s47, 16
+; GFX11-NEXT: v_writelane_b32 v23, s48, 17
+; GFX11-NEXT: v_writelane_b32 v23, s49, 18
+; GFX11-NEXT: v_writelane_b32 v23, s50, 19
+; GFX11-NEXT: v_writelane_b32 v23, s51, 20
+; GFX11-NEXT: v_writelane_b32 v23, s52, 21
+; GFX11-NEXT: v_writelane_b32 v23, s53, 22
+; GFX11-NEXT: v_writelane_b32 v23, s54, 23
+; GFX11-NEXT: v_writelane_b32 v23, s55, 24
+; GFX11-NEXT: v_writelane_b32 v23, s56, 25
+; GFX11-NEXT: v_writelane_b32 v23, s57, 26
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_writelane_b32 v23, s59, 27
+; GFX11-NEXT: v_readfirstlane_b32 s59, v22
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readlane_b32 s59, v23, 27
+; GFX11-NEXT: v_readlane_b32 s57, v23, 26
+; GFX11-NEXT: v_readlane_b32 s56, v23, 25
+; GFX11-NEXT: v_readlane_b32 s55, v23, 24
+; GFX11-NEXT: v_readlane_b32 s54, v23, 23
+; GFX11-NEXT: v_readlane_b32 s53, v23, 22
+; GFX11-NEXT: v_readlane_b32 s52, v23, 21
+; GFX11-NEXT: v_readlane_b32 s51, v23, 20
+; GFX11-NEXT: v_readlane_b32 s50, v23, 19
+; GFX11-NEXT: v_readlane_b32 s49, v23, 18
+; GFX11-NEXT: v_readlane_b32 s48, v23, 17
+; GFX11-NEXT: v_readlane_b32 s47, v23, 16
+; GFX11-NEXT: v_readlane_b32 s46, v23, 15
+; GFX11-NEXT: v_readlane_b32 s45, v23, 14
+; GFX11-NEXT: v_readlane_b32 s44, v23, 13
+; GFX11-NEXT: v_readlane_b32 s43, v23, 12
+; GFX11-NEXT: v_readlane_b32 s42, v23, 11
+; GFX11-NEXT: v_readlane_b32 s41, v23, 10
+; GFX11-NEXT: v_readlane_b32 s40, v23, 9
+; GFX11-NEXT: v_readlane_b32 s39, v23, 8
+; GFX11-NEXT: v_readlane_b32 s38, v23, 7
+; GFX11-NEXT: v_readlane_b32 s37, v23, 6
+; GFX11-NEXT: v_readlane_b32 s36, v23, 5
+; GFX11-NEXT: v_readlane_b32 s35, v23, 4
+; GFX11-NEXT: v_readlane_b32 s34, v23, 3
+; GFX11-NEXT: v_readlane_b32 s33, v23, 2
+; GFX11-NEXT: v_readlane_b32 s31, v23, 1
+; GFX11-NEXT: v_readlane_b32 s30, v23, 0
+; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
+; GFX11-NEXT: scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_immoffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:32768 ; 4-byte Folded Spill
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: v_writelane_b32 v23, s30, 0
+; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
+; GFX12-NEXT: v_writelane_b32 v23, s31, 1
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use alloca0 v1
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX12-NEXT: v_writelane_b32 v23, s33, 2
+; GFX12-NEXT: v_writelane_b32 v23, s34, 3
+; GFX12-NEXT: v_writelane_b32 v23, s35, 4
+; GFX12-NEXT: v_writelane_b32 v23, s36, 5
+; GFX12-NEXT: v_writelane_b32 v23, s37, 6
+; GFX12-NEXT: v_writelane_b32 v23, s38, 7
+; GFX12-NEXT: v_writelane_b32 v23, s39, 8
+; GFX12-NEXT: v_writelane_b32 v23, s40, 9
+; GFX12-NEXT: v_writelane_b32 v23, s41, 10
+; GFX12-NEXT: v_writelane_b32 v23, s42, 11
+; GFX12-NEXT: v_writelane_b32 v23, s43, 12
+; GFX12-NEXT: v_writelane_b32 v23, s44, 13
+; GFX12-NEXT: v_writelane_b32 v23, s45, 14
+; GFX12-NEXT: v_writelane_b32 v23, s46, 15
+; GFX12-NEXT: v_writelane_b32 v23, s47, 16
+; GFX12-NEXT: v_writelane_b32 v23, s48, 17
+; GFX12-NEXT: v_writelane_b32 v23, s49, 18
+; GFX12-NEXT: v_writelane_b32 v23, s50, 19
+; GFX12-NEXT: v_writelane_b32 v23, s51, 20
+; GFX12-NEXT: v_writelane_b32 v23, s52, 21
+; GFX12-NEXT: v_writelane_b32 v23, s53, 22
+; GFX12-NEXT: v_writelane_b32 v23, s54, 23
+; GFX12-NEXT: v_writelane_b32 v23, s55, 24
+; GFX12-NEXT: v_writelane_b32 v23, s56, 25
+; GFX12-NEXT: v_writelane_b32 v23, s57, 26
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_writelane_b32 v23, s59, 27
+; GFX12-NEXT: v_readfirstlane_b32 s59, v22
+; GFX12-NEXT: ;;#ASMSTART
+; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
+; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readlane_b32 s59, v23, 27
+; GFX12-NEXT: v_readlane_b32 s57, v23, 26
+; GFX12-NEXT: v_readlane_b32 s56, v23, 25
+; GFX12-NEXT: v_readlane_b32 s55, v23, 24
+; GFX12-NEXT: v_readlane_b32 s54, v23, 23
+; GFX12-NEXT: v_readlane_b32 s53, v23, 22
+; GFX12-NEXT: v_readlane_b32 s52, v23, 21
+; GFX12-NEXT: v_readlane_b32 s51, v23, 20
+; GFX12-NEXT: v_readlane_b32 s50, v23, 19
+; GFX12-NEXT: v_readlane_b32 s49, v23, 18
+; GFX12-NEXT: v_readlane_b32 s48, v23, 17
+; GFX12-NEXT: v_readlane_b32 s47, v23, 16
+; GFX12-NEXT: v_readlane_b32 s46, v23, 15
+; GFX12-NEXT: v_readlane_b32 s45, v23, 14
+; GFX12-NEXT: v_readlane_b32 s44, v23, 13
+; GFX12-NEXT: v_readlane_b32 s43, v23, 12
+; GFX12-NEXT: v_readlane_b32 s42, v23, 11
+; GFX12-NEXT: v_readlane_b32 s41, v23, 10
+; GFX12-NEXT: v_readlane_b32 s40, v23, 9
+; GFX12-NEXT: v_readlane_b32 s39, v23, 8
+; GFX12-NEXT: v_readlane_b32 s38, v23, 7
+; GFX12-NEXT: v_readlane_b32 s37, v23, 6
+; GFX12-NEXT: v_readlane_b32 s36, v23, 5
+; GFX12-NEXT: v_readlane_b32 s35, v23, 4
+; GFX12-NEXT: v_readlane_b32 s34, v23, 3
+; GFX12-NEXT: v_readlane_b32 s33, v23, 2
+; GFX12-NEXT: v_readlane_b32 s31, v23, 1
+; GFX12-NEXT: v_readlane_b32 s30, v23, 0
+; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:32768 ; 4-byte Folded Reload
+; GFX12-NEXT: s_mov_b32 exec_lo, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %alloca0 = alloca [4096 x i32], align 64, addrspace(5)
+ %alloca1 = alloca [4096 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0)
+
+ ; Force no SGPRs to be available for the carry-out of the vector add.
+ %asm = call %asm.output3 asm sideeffect
+ "; def $0, $1, $2, $3, $4, $5, $6, $7",
+ "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={v[0:15]},={v[16:21]},={vcc}"()
+
+ %s0 = extractvalue %asm.output3 %asm, 0
+ %s1 = extractvalue %asm.output3 %asm, 1
+ %s2 = extractvalue %asm.output3 %asm, 2
+ %s3 = extractvalue %asm.output3 %asm, 3
+ %s4 = extractvalue %asm.output3 %asm, 4
+
+ %v0 = extractvalue %asm.output3 %asm, 5
+ %v1 = extractvalue %asm.output3 %asm, 6
+
+ %vcc = extractvalue %asm.output3 %asm, 7
+
+ %alloca1.offset = getelementptr [4096 x i32], ptr addrspace(5) %alloca1, i32 0, i32 128
+
+  ; scc is unavailable since it is live-in.
+ call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9",
+ "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{v[0:15]},{v[16:21]},{vcc},{s59},{scc}"(
+ <16 x i32> %s0,
+ <16 x i32> %s1,
+ <16 x i32> %s2,
+ <8 x i32> %s3,
+ <2 x i32> %s4,
+ <16 x i32> %v0,
+ <6 x i32> %v1,
+ i64 %vcc,
+ ptr addrspace(5) %alloca1.offset,
+ i32 0) ; use of scc
+
+ ret void
+}
+
+; For gfx8/gfx9, this should enforce a budget of 24 VGPRs and 60 SGPRs (4
+; are reserved at the end for xnack + vcc).
+attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll
index d9b48f7..0167fcb 100644
--- a/llvm/test/CodeGen/AMDGPU/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/mmra.ll
@@ -17,10 +17,9 @@ define void @fence_loads(ptr %ptr) {
; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1
; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4)
- ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1, mmra !2
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec, mmra !2
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], mmra !2
- ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4)
+ ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4)
; CHECK-NEXT: SI_RETURN
fence release, !mmra !0
%ld = load atomic i8, ptr %ptr acquire, align 4, !mmra !2
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 5c09d2b..31e8a49 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -858,14 +858,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX11-SDAG: ; %bb.0:
@@ -892,39 +892,17 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -936,13 +914,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -962,14 +935,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
@@ -996,39 +969,17 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1040,13 +991,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1066,14 +1012,14 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX11-SDAG: ; %bb.0:
@@ -1100,39 +1046,17 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1144,13 +1068,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1161,32 +1080,32 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; Fill 12-bit low-bits (1ull << 33) | 4096
define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
-; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
@@ -1201,46 +1120,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1248,13 +1127,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1274,14 +1148,14 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX11-SDAG: ; %bb.0:
@@ -1308,39 +1182,17 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1352,13 +1204,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1369,32 +1216,32 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; Fill 13-bit low-bits (1ull << 33) | 8192
define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
-; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
-; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
@@ -1409,46 +1256,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1456,13 +1263,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1483,23 +1285,23 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
@@ -1517,43 +1319,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1561,13 +1333,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1588,23 +1355,23 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
@@ -1622,43 +1389,13 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1666,13 +1403,8 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1693,23 +1425,23 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
@@ -1727,43 +1459,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1771,13 +1473,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1798,23 +1495,23 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
@@ -1832,43 +1529,13 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1876,13 +1543,8 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1903,23 +1565,23 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
@@ -1937,43 +1599,13 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1981,13 +1613,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2008,23 +1635,23 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: flat_load_u8 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
@@ -2042,43 +1669,13 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2086,13 +1683,8 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -4187,3 +3779,6 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
store i8 %load, ptr undef
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10-GISEL: {{.*}}
+; GFX10-SDAG: {{.*}}
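
For readers following the check-line changes above: the offsets named in these tests (for example `(1ull << 33) | 8192` in the `*_13bit_split1` functions) are 64-bit constants that the updated GlobalISel output now adds as two 32-bit immediates directly in the carry chain, instead of first materializing them in registers. The sketch below is purely illustrative and not part of the patch; the file and `main` wrapper are my own, and it only shows the arithmetic split that produces the `0x2000` / `2` immediates seen in the new `v_add_co_u32` / `v_add_co_ci_u32_e32` lines.

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Offset used by the *_13bit_split1 tests: (1ull << 33) | 8192 */
    uint64_t offset = (1ULL << 33) | 8192;

    /* Split into the two 32-bit immediates that the updated output
       adds directly: low half to v0, high half via the carry add to v1. */
    uint32_t lo = (uint32_t)(offset & 0xffffffffu); /* 0x2000 */
    uint32_t hi = (uint32_t)(offset >> 32);         /* 2      */

    printf("lo = 0x%x, hi = %u\n", lo, hi);
    return 0;
}
```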
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index b5b8213..548c196 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -924,12 +924,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -937,12 +933,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -950,13 +942,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -968,13 +955,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1028,39 +1010,26 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1072,13 +1041,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1092,15 +1056,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1132,12 +1087,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1145,12 +1096,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1158,13 +1105,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1176,13 +1118,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1233,45 +1170,32 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; Fill 12-bit low-bits (1ull << 33) | 4096
define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
-; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
@@ -1280,44 +1204,12 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
-; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1340,12 +1232,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1353,12 +1241,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1366,13 +1250,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1384,13 +1263,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1441,45 +1315,32 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; Fill 13-bit low-bits (1ull << 33) | 8192
define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
-; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT: s_mov_b32 s5, 2
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
@@ -1488,44 +1349,12 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
-; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
-; GFX9-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1548,12 +1377,9 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1561,12 +1387,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x7ff
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1574,13 +1396,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1592,13 +1409,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1653,39 +1465,27 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x800
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1697,13 +1497,8 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1718,15 +1513,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1758,12 +1544,9 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1771,12 +1554,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0xfff
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1784,13 +1563,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1802,13 +1576,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1863,42 +1632,30 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1000
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
@@ -1907,13 +1664,8 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1928,24 +1680,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1968,12 +1702,9 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1981,12 +1712,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x1fff
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1994,13 +1721,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2012,13 +1734,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2073,42 +1790,30 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX9-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
+; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_movk_i32 s4, 0x2000
-; GFX10-GISEL-NEXT: s_brev_b32 s5, 1
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX11-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
@@ -2117,13 +1822,8 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
-; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2138,24 +1838,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 528dcfc..0f95c02 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -1008,10 +1008,10 @@ define double @v_roundeven_f64(double %x) {
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: v_or_b32_e32 v3, 0x43300000, v3
; GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT: v_mov_b32_e32 v6, -1
+; GFX6-NEXT: v_mov_b32_e32 v7, 0x432fffff
; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
-; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[6:7]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -1087,17 +1087,17 @@ define double @v_roundeven_f64_fneg(double %x) {
; GFX6-LABEL: v_roundeven_f64_fneg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
-; GFX6-NEXT: v_and_b32_e32 v3, 0x80000000, v6
+; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0x80000000, v8
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: v_or_b32_e32 v3, 0x43300000, v3
; GFX6-NEXT: v_add_f64 v[4:5], -v[0:1], v[2:3]
-; GFX6-NEXT: s_mov_b32 s4, -1
-; GFX6-NEXT: s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT: v_mov_b32_e32 v6, -1
+; GFX6-NEXT: v_mov_b32_e32 v7, 0x432fffff
; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
-; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, v[6:7]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_f64_fneg:
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
index 6d243e4..47e4406 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector_v2x16.ll
@@ -1,10 +1,8 @@
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-OPT %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-NOOPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}scalar_to_vector_i16:
-; GCN-NOOPT: s_mov_b32 [[S:s[0-9]+]], 42
-; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]]
-; GCN-OPT: v_mov_b32_e32 [[V:v[0-9]+]], 42
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 42
; GCN: buffer_store_short [[V]],
define void @scalar_to_vector_i16() {
%tmp = load <2 x i16>, ptr addrspace(5) undef
@@ -14,9 +12,7 @@ define void @scalar_to_vector_i16() {
}
; GCN-LABEL: {{^}}scalar_to_vector_f16:
-; GCN-NOOPT: s_mov_b32 [[S:s[0-9]+]], 0x3c00
-; GCN-NOOPT: v_mov_b32_e32 [[V:v[0-9]+]], [[S]]
-; GCN-OPT: v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
; GCN: buffer_store_short [[V]],
define void @scalar_to_vector_f16() {
%tmp = load <2 x half>, ptr addrspace(5) undef
diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
index a3489a8..695d522 100644
--- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
@@ -13,7 +13,7 @@
; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4
; GCN-DEFAULT: t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
-; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG # D:1 t8, Register:f32 $vgpr0, t8:1
+; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG t8, Register:f32 $vgpr0, t8:1
; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0
; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
@@ -21,7 +21,7 @@
; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0
; GCN-VERBOSE: t6: f32 = fadd [ORD=3] # D:1 t5, t4
; GCN-VERBOSE: t8: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6
-; GCN-VERBOSE: t9: ch = RETURN_TO_EPILOG [ORD=4] # D:1 t8, Register:f32 $vgpr0 # D:0, t8:1
+; GCN-VERBOSE: t9: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t8, Register:f32 $vgpr0 # D:0, t8:1
define amdgpu_ps float @test_sdag_dump(float inreg %scalar, float %vector) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index ba1caf3..bf21ed6 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -375,11 +375,8 @@ declare float @_Z4pownfi(float, i32)
; GCN: %__log2 = tail call fast float @llvm.log2.f32(float %__fabs)
; GCN: %__ylogx = fmul fast float %__log2, 1.013000e+03
; GCN: %__exp2 = tail call fast float @llvm.exp2.f32(float %__ylogx)
-; GCN: %[[r0:.*]] = bitcast float %tmp to i32
-; GCN: %__pow_sign = and i32 %[[r0]], -2147483648
-; GCN: %[[r1:.*]] = bitcast float %__exp2 to i32
-; GCN: %[[r2:.*]] = or disjoint i32 %__pow_sign, %[[r1]]
-; GCN: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+; GCN: %[[r0:.*]] = tail call float @llvm.copysign.f32(float %__exp2, float %tmp)
+; GCN: store float %[[r0]], ptr addrspace(1) %a, align 4
define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
@@ -435,11 +432,7 @@ declare <2 x half> @_Z3powDv2_DhS_(<2 x half>, <2 x half>)
; GCN: %__log2 = tail call fast half @llvm.log2.f16(half %__fabs)
; GCN: %__ylogx = fmul fast half %__log2, 0xH4A80
; GCN: %__exp2 = tail call fast half @llvm.exp2.f16(half %__ylogx)
-; GCN: %1 = bitcast half %x to i16
-; GCN: %__pow_sign = and i16 %1, -32768
-; GCN: %2 = bitcast half %__exp2 to i16
-; GCN: %3 = or disjoint i16 %__pow_sign, %2
-; GCN: %4 = bitcast i16 %3 to half
+; GCN: %1 = tail call half @llvm.copysign.f16(half %__exp2, half %x)
define half @test_pow_fast_f16__y_13(half %x) {
%powr = tail call fast half @_Z3powDhDh(half %x, half 13.0)
ret half %powr
@@ -450,11 +443,7 @@ define half @test_pow_fast_f16__y_13(half %x) {
; GCN: %__log2 = tail call fast <2 x half> @llvm.log2.v2f16(<2 x half> %__fabs)
; GCN: %__ylogx = fmul fast <2 x half> %__log2, <half 0xH4A80, half 0xH4A80>
; GCN: %__exp2 = tail call fast <2 x half> @llvm.exp2.v2f16(<2 x half> %__ylogx)
-; GCN: %1 = bitcast <2 x half> %x to <2 x i16>
-; GCN: %__pow_sign = and <2 x i16> %1, <i16 -32768, i16 -32768>
-; GCN: %2 = bitcast <2 x half> %__exp2 to <2 x i16>
-; GCN: %3 = or disjoint <2 x i16> %__pow_sign, %2
-; GCN: %4 = bitcast <2 x i16> %3 to <2 x half>
+; GCN: %1 = tail call <2 x half> @llvm.copysign.v2f16(<2 x half> %__exp2, <2 x half> %x)
define <2 x half> @test_pow_fast_v2f16__y_13(<2 x half> %x) {
%powr = tail call fast <2 x half> @_Z3powDv2_DhS_(<2 x half> %x, <2 x half> <half 13.0, half 13.0>)
ret <2 x half> %powr
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 20dc5ad..50927a2 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -25,8 +25,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: ds_write_b8 v1, v2
; CHECK-NEXT: s_mov_b64 s[4:5], exec
; CHECK-NEXT: v_writelane_b32 v0, s4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 715d4c0..3df757a 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -1,23 +1,66 @@
-;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=FUNC %s
-;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_select_v2i32:
-
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-
-; VI: s_cmp_gt_i32
-; VI: s_cselect_b32
-; VI: s_cmp_gt_i32
-; VI: s_cselect_b32
-
-; SI-DAG: s_cmp_gt_i32
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cmp_gt_i32
-; SI-DAG: s_cselect_b32
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefixes=SI %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=VI %s
+;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefixes=EG %s
define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) {
+; SI-LABEL: test_select_v2i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_gt_i32 s9, s5
+; SI-NEXT: s_cselect_b32 s5, s7, s9
+; SI-NEXT: s_cmp_gt_i32 s8, s4
+; SI-NEXT: s_cselect_b32 s4, s6, s8
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_select_v2i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_gt_i32 s9, s5
+; VI-NEXT: s_cselect_b32 s5, s7, s9
+; VI-NEXT: s_cmp_gt_i32 s8, s4
+; VI-NEXT: s_cselect_b32 s4, s6, s8
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: test_select_v2i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 5, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: SETGT_INT * T0.W, T0.Y, T1.Y,
+; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
+; EG-NEXT: SETGT_INT * T0.W, T0.X, T1.X,
+; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%load0 = load <2 x i32>, ptr addrspace(1) %in0
%load1 = load <2 x i32>, ptr addrspace(1) %in1
@@ -27,17 +70,72 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}test_select_v2f32:
-
-; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_cmp_neq_f32_e32 vcc
-; SI: v_cndmask_b32_e32
-; SI: v_cmp_neq_f32_e32 vcc
-; SI: v_cndmask_b32_e32
-
define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: test_select_v2f32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s3
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_select_v2f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: test_select_v2f32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 5, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: SETNE_DX10 * T0.W, T0.Y, T1.Y,
+; EG-NEXT: CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
+; EG-NEXT: SETNE_DX10 * T0.W, T0.X, T1.X,
+; EG-NEXT: CNDE_INT T0.X, PV.W, T1.X, T0.X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = load <2 x float>, ptr addrspace(1) %in0
%1 = load <2 x float>, ptr addrspace(1) %in1
@@ -47,24 +145,86 @@ entry:
ret void
}
-;FUNC-LABEL: {{^}}test_select_v4i32:
-
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[4].X
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].W
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
-; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
-
-; VI: s_cselect_b32
-; VI: s_cselect_b32
-; VI: s_cselect_b32
-; VI: s_cselect_b32
-
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cselect_b32
-; SI-DAG: s_cselect_b32
-
define amdgpu_kernel void @test_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <4 x i32> %val) {
+; SI-LABEL: test_select_v4i32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
+; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x11
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_gt_i32 s10, s14
+; SI-NEXT: s_cselect_b32 s2, s2, s10
+; SI-NEXT: s_cmp_gt_i32 s9, s13
+; SI-NEXT: s_cselect_b32 s1, s1, s9
+; SI-NEXT: s_cmp_gt_i32 s11, s15
+; SI-NEXT: s_cselect_b32 s3, s3, s11
+; SI-NEXT: s_cmp_gt_i32 s8, s12
+; SI-NEXT: s_cselect_b32 s0, s0, s8
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_select_v4i32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x44
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s5
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_gt_i32 s14, s18
+; VI-NEXT: s_cselect_b32 s2, s2, s14
+; VI-NEXT: s_cmp_gt_i32 s13, s17
+; VI-NEXT: s_cselect_b32 s1, s1, s13
+; VI-NEXT: s_cmp_gt_i32 s15, s19
+; VI-NEXT: s_cselect_b32 s3, s3, s15
+; VI-NEXT: s_cmp_gt_i32 s12, s16
+; VI-NEXT: s_cselect_b32 s0, s0, s12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: test_select_v4i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 9, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: SETGT_INT T1.W, T0.W, T1.W,
+; EG-NEXT: SETGT_INT * T2.W, T0.Z, T1.Z,
+; EG-NEXT: CNDE_INT * T0.W, PV.W, T0.W, KC0[4].X,
+; EG-NEXT: CNDE_INT T0.Z, T2.W, T0.Z, KC0[3].W,
+; EG-NEXT: SETGT_INT * T1.W, T0.Y, T1.Y,
+; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
+; EG-NEXT: SETGT_INT * T1.W, T0.X, T1.X,
+; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%load0 = load <4 x i32>, ptr addrspace(1) %in0
%load1 = load <4 x i32>, ptr addrspace(1) %in1
@@ -74,17 +234,92 @@ entry:
ret void
}
-;FUNC-LABEL: {{^}}test_select_v4f32:
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: test_select_v4f32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; SI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v4, s2
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2
+; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: test_select_v4f32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: test_select_v4f32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 9, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: SETNE_DX10 T2.W, T0.W, T1.W,
+; EG-NEXT: SETNE_DX10 * T3.W, T0.Z, T1.Z,
+; EG-NEXT: CNDE_INT * T0.W, PV.W, T1.W, T0.W,
+; EG-NEXT: CNDE_INT T0.Z, T3.W, T1.Z, T0.Z,
+; EG-NEXT: SETNE_DX10 * T1.W, T0.Y, T1.Y,
+; EG-NEXT: CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
+; EG-NEXT: SETNE_DX10 * T1.W, T0.X, T1.X,
+; EG-NEXT: CNDE_INT T0.X, PV.W, T1.X, T0.X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = load <4 x float>, ptr addrspace(1) %in0
%1 = load <4 x float>, ptr addrspace(1) %in1
diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll
index 13b37b4..4625499 100644
--- a/llvm/test/CodeGen/AMDGPU/while-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/while-break.ll
@@ -152,4 +152,90 @@ end:
ret float %r
}
+; Two chains of phi network that have the same value from %if block.
+define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 %z, ptr addrspace(1) %p) #0 {
+; GCN-LABEL: while_break_two_chains_of_phi:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v6, 0
+; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_branch .LBB2_2
+; GCN-NEXT: .LBB2_1: ; %Flow1
+; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GCN-NEXT: s_and_b32 s1, exec_lo, s4
+; GCN-NEXT: s_or_b32 s2, s1, s2
+; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GCN-NEXT: s_cbranch_execz .LBB2_6
+; GCN-NEXT: .LBB2_2: ; %header
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: v_cmp_ge_i32_e64 s3, s0, v1
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v1
+; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GCN-NEXT: s_cbranch_execz .LBB2_4
+; GCN-NEXT: ; %bb.3: ; %if
+; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT: s_ashr_i32 s1, s0, 31
+; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], 2
+; GCN-NEXT: s_andn2_b32 s1, s3, exec_lo
+; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v4, s6
+; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s7, v5, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v2
+; GCN-NEXT: global_load_dword v0, v[6:7], off
+; GCN-NEXT: s_and_b32 s3, vcc_lo, exec_lo
+; GCN-NEXT: s_or_b32 s3, s1, s3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v6, 1.0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, v6
+; GCN-NEXT: .LBB2_4: ; %Flow
+; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GCN-NEXT: v_mov_b32_e32 v7, v6
+; GCN-NEXT: s_mov_b32 s4, -1
+; GCN-NEXT: s_and_saveexec_b32 s1, s3
+; GCN-NEXT: s_cbranch_execz .LBB2_1
+; GCN-NEXT: ; %bb.5: ; %latch
+; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3
+; GCN-NEXT: v_mov_b32_e32 v7, v0
+; GCN-NEXT: s_add_i32 s0, s0, 1
+; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo
+; GCN-NEXT: s_branch .LBB2_1
+; GCN-NEXT: .LBB2_6: ; %end
+; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GCN-NEXT: v_mov_b32_e32 v0, v7
+; GCN-NEXT: v_mov_b32_e32 v1, v6
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ br label %header
+
+header:
+ %v.1 = phi float [ %v, %entry ], [ %v.2, %latch ]
+ %v.copy = phi float [ 0.0, %entry ], [ %v.copy.2, %latch ]
+ %ind = phi i32 [ 0, %entry], [ %ind.inc, %latch ]
+ %cc = icmp slt i32 %ind, %x
+ br i1 %cc, label %if, label %latch
+
+if:
+ %v.ptr = getelementptr float, ptr addrspace(1) %p, i32 %ind
+ %v.load = load float, ptr addrspace(1) %v.ptr
+ %v.if = fadd float %v.load, 1.0
+ %cc2 = icmp slt i32 %ind, %y
+ br i1 %cc2, label %latch, label %end
+
+latch:
+ %v.2 = phi float [ %v.1, %header ], [ %v.if, %if ]
+ %v.copy.2 = phi float [ %v.copy, %header ], [ %v.if, %if ]
+ %ind.inc = add i32 %ind, 1
+ %cc3 = icmp slt i32 %ind, %z
+ br i1 %cc3, label %end, label %header
+
+end:
+ %r = phi float [ %v.2, %latch ], [ %v.if, %if ]
+ %r2 = phi float [ %v.copy.2, %latch ], [ %v.if, %if ]
+ %packed0 = insertelement < 2 x float > poison, float %r, i32 0
+ %packed1 = insertelement < 2 x float > %packed0, float %r2, i32 1
+ ret < 2 x float> %packed1
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 8f052ef..ee71502 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3538,6 +3538,155 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
ret void
}
+; Check that exact regions with execz-affected instructions are as short as possible.
+define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
+; GFX9-W64-LABEL: short_exact_regions:
+; GFX9-W64: ; %bb.0: ; %main_body
+; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
+; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
+; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
+; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
+; GFX9-W64-NEXT: ; %bb.1: ; %if
+; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
+; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_readfirstlane_b32 s18, v0
+; GFX9-W64-NEXT: s_buffer_load_dword s18, s[8:11], s18 offset:0x0
+; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT: v_mov_b32_e32 v0, s18
+; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
+; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
+; GFX9-W64-NEXT: .LBB59_2: ; %endif
+; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT: ; return to shader part epilog
+;
+; GFX10-W32-LABEL: short_exact_regions:
+; GFX10-W32: ; %bb.0: ; %main_body
+; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
+; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
+; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
+; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
+; GFX10-W32-NEXT: ; %bb.1: ; %if
+; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
+; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_readfirstlane_b32 s15, v0
+; GFX10-W32-NEXT: s_buffer_load_dword s15, s[8:11], s15 offset:0x0
+; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT: v_mov_b32_e32 v0, s15
+; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
+; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
+; GFX10-W32-NEXT: .LBB59_2: ; %endif
+; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: ; return to shader part epilog
+main_body:
+ %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+ %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %cc = icmp uge i32 %hi, 16
+ br i1 %cc, label %endif, label %if
+
+if:
+ %idx1 = extractelement <4 x i32> %idx0, i64 0
+ %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
+ %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %idx2, i32 0)
+
+ call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex1, <4 x i32> undef, i32 %idx3, i32 0, i32 0, i32 0)
+ br label %endif
+
+endif:
+ %d = extractelement <4 x float> %tex1, i64 0
+ %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+ %r0 = extractelement <4 x float> %tex1, i64 1
+ %r1 = extractelement <4 x float> %tex2, i64 2
+ %r2 = fadd float %r0, %r1
+ %out = call float @llvm.amdgcn.wqm.f32(float %r2)
+
+ ret float %out
+}
+
+; Check that shortening exact regions doesn't prevent an early WQM exit.
+define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
+; GFX9-W64-LABEL: short_exact_regions_2:
+; GFX9-W64: ; %bb.0: ; %main_body
+; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
+; GFX9-W64-NEXT: s_wqm_b64 exec, exec
+; GFX9-W64-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
+; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
+; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
+; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
+; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
+; GFX9-W64-NEXT: ; kill: killed $vgpr3
+; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2
+; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
+; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5
+; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX9-W64-NEXT: ; return to shader part epilog
+;
+; GFX10-W32-LABEL: short_exact_regions_2:
+; GFX10-W32: ; %bb.0: ; %main_body
+; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-W32-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
+; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
+; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
+; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1
+; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
+; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-W32-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX10-W32-NEXT: ; return to shader part epilog
+main_body:
+ %tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+ %idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %idx1 = extractelement <4 x i32> %idx0, i64 0
+ %d = extractelement <4 x float> %tex1, i64 0
+
+ %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+
+ %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
+ %idx3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %idx2, i32 0)
+
+ %r0 = extractelement <4 x float> %tex1, i64 1
+ %r1 = extractelement <4 x float> %tex2, i64 2
+ %r2 = fadd float %r0, %r1
+ %out = fadd float %r2, %idx3
+
+ ret float %out
+}
+
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
@@ -3577,6 +3726,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
+declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll
index 37f511f..9f0edb7 100644
--- a/llvm/test/CodeGen/ARM/vselect_imax.ll
+++ b/llvm/test/CodeGen/ARM/vselect_imax.ll
@@ -1,9 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes='print<cost-model>' -mtriple=arm-apple-ios6.0.0 -mcpu=cortex-a8 2>&1 -disable-output | FileCheck %s --check-prefix=COST
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; Make sure that ARM backend with NEON handles vselect.
define void @vmax_v4i32(ptr %m, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: vmax.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vmax_v4i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: add r1, sp, #8
+; CHECK-NEXT: vldr d17, [sp]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
+; CHECK-NEXT: vmov d16, r2, r3
+; CHECK-NEXT: vmax.s32 q8, q8, q9
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT: mov pc, lr
%cmpres = icmp sgt <4 x i32> %a, %b
%maxres = select <4 x i1> %cmpres, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %maxres, ptr %m
@@ -12,51 +21,84 @@ define void @vmax_v4i32(ptr %m, <4 x i32> %a, <4 x i32> %b) {
%T0_10 = type <16 x i16>
%T1_10 = type <16 x i1>
+define void @func_blend10(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
; CHECK-LABEL: func_blend10:
-define void @func_blend10(ptr %loadaddr, ptr %loadaddr2,
- ptr %blend, ptr %storeaddr) {
- %v0 = load %T0_10, ptr %loadaddr
- %v1 = load %T0_10, ptr %loadaddr2
- %c = icmp slt %T0_10 %v0, %v1
-; CHECK: vmin.s16
-; CHECK: vmin.s16
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
+; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vmin.s16 q8, q9, q8
+; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT: vmin.s16 q9, q9, q10
+; CHECK-NEXT: vst1.16 {d16, d17}, [r3:128]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128]
+; CHECK-NEXT: mov pc, lr
; COST: func_blend10
; COST: cost of 0 {{.*}} icmp
; COST: cost of 4 {{.*}} select
+
+ %v0 = load %T0_10, ptr %loadaddr
+ %v1 = load %T0_10, ptr %loadaddr2
+ %c = icmp slt %T0_10 %v0, %v1
%r = select %T1_10 %c, %T0_10 %v0, %T0_10 %v1
store %T0_10 %r, ptr %storeaddr
ret void
}
+
%T0_14 = type <8 x i32>
%T1_14 = type <8 x i1>
+define void @func_blend14(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
; CHECK-LABEL: func_blend14:
-define void @func_blend14(ptr %loadaddr, ptr %loadaddr2,
- ptr %blend, ptr %storeaddr) {
- %v0 = load %T0_14, ptr %loadaddr
- %v1 = load %T0_14, ptr %loadaddr2
- %c = icmp slt %T0_14 %v0, %v1
-; CHECK: vmin.s32
-; CHECK: vmin.s32
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.32 {d16, d17}, [r1:128]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vmin.s32 q8, q9, q8
+; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT: vmin.s32 q9, q9, q10
+; CHECK-NEXT: vst1.32 {d16, d17}, [r3:128]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128]
+; CHECK-NEXT: mov pc, lr
; COST: func_blend14
; COST: cost of 0 {{.*}} icmp
; COST: cost of 4 {{.*}} select
+ %v0 = load %T0_14, ptr %loadaddr
+ %v1 = load %T0_14, ptr %loadaddr2
+ %c = icmp slt %T0_14 %v0, %v1
%r = select %T1_14 %c, %T0_14 %v0, %T0_14 %v1
store %T0_14 %r, ptr %storeaddr
ret void
}
+
%T0_15 = type <16 x i32>
%T1_15 = type <16 x i1>
+define void @func_blend15(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
; CHECK-LABEL: func_blend15:
-define void @func_blend15(ptr %loadaddr, ptr %loadaddr2,
- ptr %blend, ptr %storeaddr) {
-; CHECK: vmin.s32
-; CHECK: vmin.s32
- %v0 = load %T0_15, ptr %loadaddr
- %v1 = load %T0_15, ptr %loadaddr2
- %c = icmp slt %T0_15 %v0, %v1
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.32 {d16, d17}, [r1:128]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vmin.s32 q8, q9, q8
+; CHECK-NEXT: vld1.32 {d20, d21}, [r1:128]!
+; CHECK-NEXT: vld1.32 {d22, d23}, [r0:128]!
+; CHECK-NEXT: vmin.s32 q10, q11, q10
+; CHECK-NEXT: vld1.32 {d24, d25}, [r1:128]!
+; CHECK-NEXT: vld1.32 {d26, d27}, [r0:128]!
+; CHECK-NEXT: vmin.s32 q12, q13, q12
+; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128]
+; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128]
+; CHECK-NEXT: vmin.s32 q9, q11, q9
+; CHECK-NEXT: vst1.32 {d16, d17}, [r3:128]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r3:128]!
+; CHECK-NEXT: vst1.32 {d24, d25}, [r3:128]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128]
+; CHECK-NEXT: mov pc, lr
; COST: func_blend15
; COST: cost of 0 {{.*}} icmp
; COST: cost of 8 {{.*}} select
+
+ %v0 = load %T0_15, ptr %loadaddr
+ %v1 = load %T0_15, ptr %loadaddr2
+ %c = icmp slt %T0_15 %v0, %v1
%r = select %T1_15 %c, %T0_15 %v0, %T0_15 %v1
store %T0_15 %r, ptr %storeaddr
ret void
@@ -66,8 +108,7 @@ define void @func_blend15(ptr %loadaddr, ptr %loadaddr2,
; lowering we also need to adjust the cost.
%T0_18 = type <4 x i64>
%T1_18 = type <4 x i1>
-define void @func_blend18(ptr %loadaddr, ptr %loadaddr2,
- ptr %blend, ptr %storeaddr) {
+define void @func_blend18(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
; CHECK-LABEL: func_blend18:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
@@ -118,20 +159,20 @@ define void @func_blend18(ptr %loadaddr, ptr %loadaddr2,
; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128]
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: mov pc, lr
- %v0 = load %T0_18, ptr %loadaddr
- %v1 = load %T0_18, ptr %loadaddr2
- %c = icmp slt %T0_18 %v0, %v1
; COST: func_blend18
; COST: cost of 0 {{.*}} icmp
; COST: cost of 21 {{.*}} select
+ %v0 = load %T0_18, ptr %loadaddr
+ %v1 = load %T0_18, ptr %loadaddr2
+ %c = icmp slt %T0_18 %v0, %v1
%r = select %T1_18 %c, %T0_18 %v0, %T0_18 %v1
store %T0_18 %r, ptr %storeaddr
ret void
}
+
%T0_19 = type <8 x i64>
%T1_19 = type <8 x i1>
-define void @func_blend19(ptr %loadaddr, ptr %loadaddr2,
- ptr %blend, ptr %storeaddr) {
+define void @func_blend19(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
; CHECK-LABEL: func_blend19:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, lr}
@@ -226,20 +267,20 @@ define void @func_blend19(ptr %loadaddr, ptr %loadaddr2,
; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]
; CHECK-NEXT: pop {r4, r5, r6, lr}
; CHECK-NEXT: mov pc, lr
- %v0 = load %T0_19, ptr %loadaddr
- %v1 = load %T0_19, ptr %loadaddr2
- %c = icmp slt %T0_19 %v0, %v1
; COST: func_blend19
; COST: cost of 0 {{.*}} icmp
; COST: cost of 54 {{.*}} select
+ %v0 = load %T0_19, ptr %loadaddr
+ %v1 = load %T0_19, ptr %loadaddr2
+ %c = icmp slt %T0_19 %v0, %v1
%r = select %T1_19 %c, %T0_19 %v0, %T0_19 %v1
store %T0_19 %r, ptr %storeaddr
ret void
}
+
%T0_20 = type <16 x i64>
%T1_20 = type <16 x i1>
-define void @func_blend20(ptr %loadaddr, ptr %loadaddr2,
- ptr %blend, ptr %storeaddr) {
+define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, ptr %blend, ptr %storeaddr) {
; CHECK-LABEL: func_blend20:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
@@ -435,12 +476,12 @@ define void @func_blend20(ptr %loadaddr, ptr %loadaddr2,
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: mov pc, lr
- %v0 = load %T0_20, ptr %loadaddr
- %v1 = load %T0_20, ptr %loadaddr2
- %c = icmp slt %T0_20 %v0, %v1
; COST: func_blend20
; COST: cost of 0 {{.*}} icmp
; COST: cost of 108 {{.*}} select
+ %v0 = load %T0_20, ptr %loadaddr
+ %v1 = load %T0_20, ptr %loadaddr2
+ %c = icmp slt %T0_20 %v0, %v1
%r = select %T1_20 %c, %T0_20 %v0, %T0_20 %v1
store %T0_20 %r, ptr %storeaddr
ret void
diff --git a/llvm/test/CodeGen/DirectX/UAVMetadata.ll b/llvm/test/CodeGen/DirectX/UAVMetadata.ll
index 0bc8a8c..bdad9fd 100644
--- a/llvm/test/CodeGen/DirectX/UAVMetadata.ll
+++ b/llvm/test/CodeGen/DirectX/UAVMetadata.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s
-; RUN: opt -S --passes="print-dxil-resource" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
+; RUN: opt -S --passes="print-dxil-resource-md" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
; RUN: llc %s --filetype=asm -o - < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
diff --git a/llvm/test/CodeGen/DirectX/abs.ll b/llvm/test/CodeGen/DirectX/abs.ll
index 822580e..85090a5 100644
--- a/llvm/test/CodeGen/DirectX/abs.ll
+++ b/llvm/test/CodeGen/DirectX/abs.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
; Make sure dxil operation function calls for abs are generated for int16_t/int/int64_t.
diff --git a/llvm/test/CodeGen/DirectX/acos.ll b/llvm/test/CodeGen/DirectX/acos.ll
index 31b0883..cc32182 100644
--- a/llvm/test/CodeGen/DirectX/acos.ll
+++ b/llvm/test/CodeGen/DirectX/acos.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for acos are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/acos_error.ll b/llvm/test/CodeGen/DirectX/acos_error.ll
index e0474e9..4125709 100644
--- a/llvm/test/CodeGen/DirectX/acos_error.ll
+++ b/llvm/test/CodeGen/DirectX/acos_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation acos does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/asin.ll b/llvm/test/CodeGen/DirectX/asin.ll
index 56c2d86..06e3bab 100644
--- a/llvm/test/CodeGen/DirectX/asin.ll
+++ b/llvm/test/CodeGen/DirectX/asin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for asin are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/asin_error.ll b/llvm/test/CodeGen/DirectX/asin_error.ll
index ddd4b2e..de63b0d 100644
--- a/llvm/test/CodeGen/DirectX/asin_error.ll
+++ b/llvm/test/CodeGen/DirectX/asin_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation asin does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/atan.ll b/llvm/test/CodeGen/DirectX/atan.ll
index 7aa4418..d7c4cd0 100644
--- a/llvm/test/CodeGen/DirectX/atan.ll
+++ b/llvm/test/CodeGen/DirectX/atan.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for atan are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/atan_error.ll b/llvm/test/CodeGen/DirectX/atan_error.ll
index 1880b1d..c320868 100644
--- a/llvm/test/CodeGen/DirectX/atan_error.ll
+++ b/llvm/test/CodeGen/DirectX/atan_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation atan does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/cbuf.ll b/llvm/test/CodeGen/DirectX/cbuf.ll
index d07cc1e..38f08fa 100644
--- a/llvm/test/CodeGen/DirectX/cbuf.ll
+++ b/llvm/test/CodeGen/DirectX/cbuf.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -dxil-metadata-emit < %s | FileCheck %s --check-prefix=DXILMD
-; RUN: opt -S --passes="print-dxil-resource" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
+; RUN: opt -S --passes="print-dxil-resource-md" < %s 2>&1 | FileCheck %s --check-prefix=PRINT
target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-unknown-shadermodel6.7-library"
diff --git a/llvm/test/CodeGen/DirectX/ceil.ll b/llvm/test/CodeGen/DirectX/ceil.ll
index 1585471..48bc549 100644
--- a/llvm/test/CodeGen/DirectX/ceil.ll
+++ b/llvm/test/CodeGen/DirectX/ceil.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for ceil are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/ceil_error.ll b/llvm/test/CodeGen/DirectX/ceil_error.ll
index 1b554d8..da6f083 100644
--- a/llvm/test/CodeGen/DirectX/ceil_error.ll
+++ b/llvm/test/CodeGen/DirectX/ceil_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation ceil does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/clamp.ll b/llvm/test/CodeGen/DirectX/clamp.ll
index f122313..2f29e44 100644
--- a/llvm/test/CodeGen/DirectX/clamp.ll
+++ b/llvm/test/CodeGen/DirectX/clamp.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for clamp/uclamp are generated for half/float/double/i16/i32/i64.
diff --git a/llvm/test/CodeGen/DirectX/comput_ids.ll b/llvm/test/CodeGen/DirectX/comput_ids.ll
index 5539940..976b3ea 100644
--- a/llvm/test/CodeGen/DirectX/comput_ids.ll
+++ b/llvm/test/CodeGen/DirectX/comput_ids.ll
@@ -1,9 +1,9 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower %s | FileCheck %s
; Make sure dxil operation function calls for all ComputeID dxil operations are generated.
target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
+target triple = "dxil-pc-shadermodel6.7-compute"
; CHECK-LABEL: @test_thread_id(
; Function Attrs: noinline nounwind optnone
diff --git a/llvm/test/CodeGen/DirectX/cos.ll b/llvm/test/CodeGen/DirectX/cos.ll
index 00f2e2c..72f4bfc 100644
--- a/llvm/test/CodeGen/DirectX/cos.ll
+++ b/llvm/test/CodeGen/DirectX/cos.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for cos are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/cos_error.ll b/llvm/test/CodeGen/DirectX/cos_error.ll
index a074f5b..6bb85a7 100644
--- a/llvm/test/CodeGen/DirectX/cos_error.ll
+++ b/llvm/test/CodeGen/DirectX/cos_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation cos does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/cosh.ll b/llvm/test/CodeGen/DirectX/cosh.ll
index 4fe22f0..91aaf89 100644
--- a/llvm/test/CodeGen/DirectX/cosh.ll
+++ b/llvm/test/CodeGen/DirectX/cosh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for cosh are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/cosh_error.ll b/llvm/test/CodeGen/DirectX/cosh_error.ll
index cf66c54..4c5f0c7 100644
--- a/llvm/test/CodeGen/DirectX/cosh_error.ll
+++ b/llvm/test/CodeGen/DirectX/cosh_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation cosh does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/dot2_error.ll b/llvm/test/CodeGen/DirectX/dot2_error.ll
index a27bfae..54780d1 100644
--- a/llvm/test/CodeGen/DirectX/dot2_error.ll
+++ b/llvm/test/CodeGen/DirectX/dot2_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation dot2 does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/dot3_error.ll b/llvm/test/CodeGen/DirectX/dot3_error.ll
index eb69fb1..242716b 100644
--- a/llvm/test/CodeGen/DirectX/dot3_error.ll
+++ b/llvm/test/CodeGen/DirectX/dot3_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation dot3 does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/dot4_error.ll b/llvm/test/CodeGen/DirectX/dot4_error.ll
index 5cd6326..731adda 100644
--- a/llvm/test/CodeGen/DirectX/dot4_error.ll
+++ b/llvm/test/CodeGen/DirectX/dot4_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation dot4 does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/exp.ll b/llvm/test/CodeGen/DirectX/exp.ll
index fdafc14..f67e274 100644
--- a/llvm/test/CodeGen/DirectX/exp.ll
+++ b/llvm/test/CodeGen/DirectX/exp.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for exp are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/exp2_error.ll b/llvm/test/CodeGen/DirectX/exp2_error.ll
index 6b91267..4d13f93 100644
--- a/llvm/test/CodeGen/DirectX/exp2_error.ll
+++ b/llvm/test/CodeGen/DirectX/exp2_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation exp2 does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/fabs.ll b/llvm/test/CodeGen/DirectX/fabs.ll
index 3b3f8aa..becbdf8 100644
--- a/llvm/test/CodeGen/DirectX/fabs.ll
+++ b/llvm/test/CodeGen/DirectX/fabs.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for abs are generated for float, half, and double.
diff --git a/llvm/test/CodeGen/DirectX/fdot.ll b/llvm/test/CodeGen/DirectX/fdot.ll
index 3e13b2a..56817a1 100644
--- a/llvm/test/CodeGen/DirectX/fdot.ll
+++ b/llvm/test/CodeGen/DirectX/fdot.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for dot are generated for int/uint vectors.
diff --git a/llvm/test/CodeGen/DirectX/flattened_thread_id_in_group_error.ll b/llvm/test/CodeGen/DirectX/flattened_thread_id_in_group_error.ll
new file mode 100644
index 0000000..9abea5e
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/flattened_thread_id_in_group_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation FlattenedThreadIdInGroup is not valid in the library stage
+; CHECK: LLVM ERROR: library : Invalid Shader Stage for DXIL operation - FlattenedThreadIdInGroup
+
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_flattened_thread_id_in_group() #0 {
+entry:
+ %0 = call i32 @llvm.dx.flattened.thread.id.in.group()
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/floor.ll b/llvm/test/CodeGen/DirectX/floor.ll
index b033e2e..f667cab 100644
--- a/llvm/test/CodeGen/DirectX/floor.ll
+++ b/llvm/test/CodeGen/DirectX/floor.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for floor are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/floor_error.ll b/llvm/test/CodeGen/DirectX/floor_error.ll
index 3b51a4b..e3190e5 100644
--- a/llvm/test/CodeGen/DirectX/floor_error.ll
+++ b/llvm/test/CodeGen/DirectX/floor_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation floor does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/fmax.ll b/llvm/test/CodeGen/DirectX/fmax.ll
index aff722c..05852ee3 100644
--- a/llvm/test/CodeGen/DirectX/fmax.ll
+++ b/llvm/test/CodeGen/DirectX/fmax.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for fmax are generated for half/float/double.
diff --git a/llvm/test/CodeGen/DirectX/fmin.ll b/llvm/test/CodeGen/DirectX/fmin.ll
index 2f7c209..1c6c7ca 100644
--- a/llvm/test/CodeGen/DirectX/fmin.ll
+++ b/llvm/test/CodeGen/DirectX/fmin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for fmin are generated for half/float/double.
diff --git a/llvm/test/CodeGen/DirectX/frac_error.ll b/llvm/test/CodeGen/DirectX/frac_error.ll
index ebce761..1bc3558 100644
--- a/llvm/test/CodeGen/DirectX/frac_error.ll
+++ b/llvm/test/CodeGen/DirectX/frac_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation frac does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/group_id_error.ll b/llvm/test/CodeGen/DirectX/group_id_error.ll
new file mode 100644
index 0000000..2a6adcf
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/group_id_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation GroupId is not valid in the pixel stage
+; CHECK: LLVM ERROR: pixel : Invalid Shader Stage for DXIL operation - GroupId
+
+target triple = "dxil-pc-shadermodel6.7-pixel"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_group_id(i32 %a) #0 {
+entry:
+ %0 = call i32 @llvm.dx.group.id(i32 %a)
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/idot.ll b/llvm/test/CodeGen/DirectX/idot.ll
index 9f89a8d..eac1b91 100644
--- a/llvm/test/CodeGen/DirectX/idot.ll
+++ b/llvm/test/CodeGen/DirectX/idot.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
; Make sure dxil operation function calls for dot are generated for int/uint vectors.
diff --git a/llvm/test/CodeGen/DirectX/isinf.ll b/llvm/test/CodeGen/DirectX/isinf.ll
index e2975da..295776b 100644
--- a/llvm/test/CodeGen/DirectX/isinf.ll
+++ b/llvm/test/CodeGen/DirectX/isinf.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for isinf are generated for float and half.
; CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 9, float %{{.*}})
diff --git a/llvm/test/CodeGen/DirectX/isinf_error.ll b/llvm/test/CodeGen/DirectX/isinf_error.ll
index 95b2d0c..39b83554 100644
--- a/llvm/test/CodeGen/DirectX/isinf_error.ll
+++ b/llvm/test/CodeGen/DirectX/isinf_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation isinf does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/log.ll b/llvm/test/CodeGen/DirectX/log.ll
index 172c3bf..ee28908 100644
--- a/llvm/test/CodeGen/DirectX/log.ll
+++ b/llvm/test/CodeGen/DirectX/log.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
; Make sure dxil operation function calls for log are generated.
diff --git a/llvm/test/CodeGen/DirectX/log10.ll b/llvm/test/CodeGen/DirectX/log10.ll
index d4f827a..a69f270 100644
--- a/llvm/test/CodeGen/DirectX/log10.ll
+++ b/llvm/test/CodeGen/DirectX/log10.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
; Make sure dxil operation function calls for log10 are generated.
diff --git a/llvm/test/CodeGen/DirectX/log2.ll b/llvm/test/CodeGen/DirectX/log2.ll
index 2164d4d..d6a7ba0 100644
--- a/llvm/test/CodeGen/DirectX/log2.ll
+++ b/llvm/test/CodeGen/DirectX/log2.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for log2 are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/log2_error.ll b/llvm/test/CodeGen/DirectX/log2_error.ll
index a26f6e8..b887685 100644
--- a/llvm/test/CodeGen/DirectX/log2_error.ll
+++ b/llvm/test/CodeGen/DirectX/log2_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation log2 does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/pow.ll b/llvm/test/CodeGen/DirectX/pow.ll
index 25ce0fe..0e83c4f 100644
--- a/llvm/test/CodeGen/DirectX/pow.ll
+++ b/llvm/test/CodeGen/DirectX/pow.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
; Make sure dxil operation function calls for pow are generated.
diff --git a/llvm/test/CodeGen/DirectX/reversebits.ll b/llvm/test/CodeGen/DirectX/reversebits.ll
index b6a7a1b..1ade57b 100644
--- a/llvm/test/CodeGen/DirectX/reversebits.ll
+++ b/llvm/test/CodeGen/DirectX/reversebits.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for reversebits are generated for all integer types.
diff --git a/llvm/test/CodeGen/DirectX/round.ll b/llvm/test/CodeGen/DirectX/round.ll
index e0a3772..db953fb 100644
--- a/llvm/test/CodeGen/DirectX/round.ll
+++ b/llvm/test/CodeGen/DirectX/round.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for round are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/round_error.ll b/llvm/test/CodeGen/DirectX/round_error.ll
index 2d27fbb..9d2a4e7 100644
--- a/llvm/test/CodeGen/DirectX/round_error.ll
+++ b/llvm/test/CodeGen/DirectX/round_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; This test is expected to fail with the following error
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/rsqrt.ll b/llvm/test/CodeGen/DirectX/rsqrt.ll
index 52af0e6..054c844 100644
--- a/llvm/test/CodeGen/DirectX/rsqrt.ll
+++ b/llvm/test/CodeGen/DirectX/rsqrt.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for rsqrt are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/rsqrt_error.ll b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
index 9cd5002..5e29e37 100644
--- a/llvm/test/CodeGen/DirectX/rsqrt_error.ll
+++ b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation rsqrt does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/sin.ll b/llvm/test/CodeGen/DirectX/sin.ll
index 1f285c4..f309a36 100644
--- a/llvm/test/CodeGen/DirectX/sin.ll
+++ b/llvm/test/CodeGen/DirectX/sin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for sin are generated for float and half.
; CHECK:call float @dx.op.unary.f32(i32 13, float %{{.*}})
diff --git a/llvm/test/CodeGen/DirectX/sin_error.ll b/llvm/test/CodeGen/DirectX/sin_error.ll
index ece0e53..0e20031 100644
--- a/llvm/test/CodeGen/DirectX/sin_error.ll
+++ b/llvm/test/CodeGen/DirectX/sin_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.0-library %s 2>&1 | FileCheck %s
; DXIL operation sin does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/sin_no_stage_error.ll b/llvm/test/CodeGen/DirectX/sin_no_stage_error.ll
new file mode 100644
index 0000000..673fc22
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/sin_no_stage_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.0 %s 2>&1 | FileCheck %s
+
+; A shader stage is required to determine whether the operation is supported.
+; CHECK: LLVM ERROR: 1.0: Unknown Compilation Target Shader Stage specified
+
+define noundef float @sin_float(float noundef %a) #0 {
+entry:
+ %a.addr = alloca float, align 4
+ store float %a, ptr %a.addr, align 4
+ %0 = load float, ptr %a.addr, align 4
+ %1 = call float @llvm.sin.f32(float %0)
+ ret float %1
+}
diff --git a/llvm/test/CodeGen/DirectX/sinh.ll b/llvm/test/CodeGen/DirectX/sinh.ll
index 76d1898..d4d3eda 100644
--- a/llvm/test/CodeGen/DirectX/sinh.ll
+++ b/llvm/test/CodeGen/DirectX/sinh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for sinh are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/sinh_error.ll b/llvm/test/CodeGen/DirectX/sinh_error.ll
index 6a021ce8..06aeca0 100644
--- a/llvm/test/CodeGen/DirectX/sinh_error.ll
+++ b/llvm/test/CodeGen/DirectX/sinh_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation sinh does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/smax.ll b/llvm/test/CodeGen/DirectX/smax.ll
index 8b24067..bcda51c 100644
--- a/llvm/test/CodeGen/DirectX/smax.ll
+++ b/llvm/test/CodeGen/DirectX/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for smax are generated for i16/i32/i64.
diff --git a/llvm/test/CodeGen/DirectX/smin.ll b/llvm/test/CodeGen/DirectX/smin.ll
index b2b40a1..8d48847 100644
--- a/llvm/test/CodeGen/DirectX/smin.ll
+++ b/llvm/test/CodeGen/DirectX/smin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for smin are generated for i16/i32/i64.
diff --git a/llvm/test/CodeGen/DirectX/sqrt.ll b/llvm/test/CodeGen/DirectX/sqrt.ll
index 76a572e..792fbc8 100644
--- a/llvm/test/CodeGen/DirectX/sqrt.ll
+++ b/llvm/test/CodeGen/DirectX/sqrt.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for sqrt are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/sqrt_error.ll b/llvm/test/CodeGen/DirectX/sqrt_error.ll
index fffa2e1..1477abc 100644
--- a/llvm/test/CodeGen/DirectX/sqrt_error.ll
+++ b/llvm/test/CodeGen/DirectX/sqrt_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation sqrt does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/tan.ll b/llvm/test/CodeGen/DirectX/tan.ll
index 567ab02..6f7beb5 100644
--- a/llvm/test/CodeGen/DirectX/tan.ll
+++ b/llvm/test/CodeGen/DirectX/tan.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for tan are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/tan_error.ll b/llvm/test/CodeGen/DirectX/tan_error.ll
index c870c36..fa03e53 100644
--- a/llvm/test/CodeGen/DirectX/tan_error.ll
+++ b/llvm/test/CodeGen/DirectX/tan_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation tan does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/tanh.ll b/llvm/test/CodeGen/DirectX/tanh.ll
index d031317..e6642d9 100644
--- a/llvm/test/CodeGen/DirectX/tanh.ll
+++ b/llvm/test/CodeGen/DirectX/tanh.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for tanh are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/tanh_error.ll b/llvm/test/CodeGen/DirectX/tanh_error.ll
index a1b8cbf..933ffbc 100644
--- a/llvm/test/CodeGen/DirectX/tanh_error.ll
+++ b/llvm/test/CodeGen/DirectX/tanh_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation tanh does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload
diff --git a/llvm/test/CodeGen/DirectX/thread_id_error.ll b/llvm/test/CodeGen/DirectX/thread_id_error.ll
new file mode 100644
index 0000000..69289323
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/thread_id_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation ThreadId is not valid in the library stage
+; CHECK: LLVM ERROR: library : Invalid Shader Stage for DXIL operation - ThreadId
+
+target triple = "dxil-pc-shadermodel6.7-library"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_thread_id(i32 %a) #0 {
+entry:
+ %0 = call i32 @llvm.dx.thread.id(i32 %a)
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/thread_id_in_group_error.ll b/llvm/test/CodeGen/DirectX/thread_id_in_group_error.ll
new file mode 100644
index 0000000..8b63fd7
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/thread_id_in_group_error.ll
@@ -0,0 +1,13 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation ThreadIdInGroup is not valid in the vertex stage
+; CHECK: LLVM ERROR: vertex : Invalid Shader Stage for DXIL operation - ThreadIdInGroup
+
+target triple = "dxil-pc-shadermodel6.7-vertex"
+
+; Function Attrs: noinline nounwind optnone
+define i32 @test_thread_id_in_group(i32 %a) #0 {
+entry:
+ %0 = call i32 @llvm.dx.thread.id.in.group(i32 %a)
+ ret i32 %0
+}
diff --git a/llvm/test/CodeGen/DirectX/trunc.ll b/llvm/test/CodeGen/DirectX/trunc.ll
index 2072f28..f00b737 100644
--- a/llvm/test/CodeGen/DirectX/trunc.ll
+++ b/llvm/test/CodeGen/DirectX/trunc.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for trunc are generated for float and half.
diff --git a/llvm/test/CodeGen/DirectX/trunc_error.ll b/llvm/test/CodeGen/DirectX/trunc_error.ll
index 751b0b9..ccc7b1d 100644
--- a/llvm/test/CodeGen/DirectX/trunc_error.ll
+++ b/llvm/test/CodeGen/DirectX/trunc_error.ll
@@ -1,4 +1,4 @@
-; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
; DXIL operation trunc does not support double overload type
; CHECK: LLVM ERROR: Invalid Overload Type
diff --git a/llvm/test/CodeGen/DirectX/umax.ll b/llvm/test/CodeGen/DirectX/umax.ll
index be0f557..a4bd66e 100644
--- a/llvm/test/CodeGen/DirectX/umax.ll
+++ b/llvm/test/CodeGen/DirectX/umax.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for umax are generated for i16/i32/i64.
diff --git a/llvm/test/CodeGen/DirectX/umin.ll b/llvm/test/CodeGen/DirectX/umin.ll
index 5051c71..a551f8f 100644
--- a/llvm/test/CodeGen/DirectX/umin.ll
+++ b/llvm/test/CodeGen/DirectX/umin.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
; Make sure dxil operation function calls for umin are generated for i16/i32/i64.
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll
index 4b2b72a..7bc7a98 100644
--- a/llvm/test/CodeGen/LoongArch/code-models.ll
+++ b/llvm/test/CodeGen/LoongArch/code-models.ll
@@ -33,11 +33,11 @@ define i32 @call_globaladdress(i32 %a) nounwind {
; LARGE: # %bb.0:
; LARGE-NEXT: addi.d $sp, $sp, -16
; LARGE-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LARGE-NEXT: pcalau12i $ra, %got_pc_hi20(callee)
-; LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(callee)
-; LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(callee)
-; LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(callee)
-; LARGE-NEXT: ldx.d $ra, $t8, $ra
+; LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(callee)
+; LARGE-NEXT: addi.d $ra, $zero, %got_pc_lo12(callee)
+; LARGE-NEXT: lu32i.d $ra, %got64_pc_lo20(callee)
+; LARGE-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(callee)
+; LARGE-NEXT: ldx.d $ra, $ra, $a1
; LARGE-NEXT: jirl $ra, $ra, 0
; LARGE-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; LARGE-NEXT: addi.d $sp, $sp, 16
@@ -82,11 +82,11 @@ define void @call_external_sym(ptr %dst) {
; LARGE-NEXT: .cfi_offset 1, -8
; LARGE-NEXT: ori $a2, $zero, 1000
; LARGE-NEXT: move $a1, $zero
-; LARGE-NEXT: pcalau12i $ra, %pc_hi20(memset)
-; LARGE-NEXT: addi.d $t8, $zero, %pc_lo12(memset)
-; LARGE-NEXT: lu32i.d $t8, %pc64_lo20(memset)
-; LARGE-NEXT: lu52i.d $t8, $t8, %pc64_hi12(memset)
-; LARGE-NEXT: add.d $ra, $t8, $ra
+; LARGE-NEXT: pcalau12i $a3, %pc_hi20(memset)
+; LARGE-NEXT: addi.d $ra, $zero, %pc_lo12(memset)
+; LARGE-NEXT: lu32i.d $ra, %pc64_lo20(memset)
+; LARGE-NEXT: lu52i.d $ra, $ra, %pc64_hi12(memset)
+; LARGE-NEXT: add.d $ra, $ra, $a3
; LARGE-NEXT: jirl $ra, $ra, 0
; LARGE-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; LARGE-NEXT: addi.d $sp, $sp, 16
@@ -105,17 +105,17 @@ define i32 @caller_tail(i32 %i) nounwind {
;
; MEDIUM-LABEL: caller_tail:
; MEDIUM: # %bb.0: # %entry
-; MEDIUM-NEXT: pcaddu18i $t8, %call36(callee_tail)
-; MEDIUM-NEXT: jr $t8
+; MEDIUM-NEXT: pcaddu18i $a1, %call36(callee_tail)
+; MEDIUM-NEXT: jr $a1
;
; LARGE-LABEL: caller_tail:
; LARGE: # %bb.0: # %entry
-; LARGE-NEXT: pcalau12i $t7, %got_pc_hi20(callee_tail)
-; LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(callee_tail)
-; LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(callee_tail)
-; LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(callee_tail)
-; LARGE-NEXT: ldx.d $t7, $t8, $t7
-; LARGE-NEXT: jr $t7
+; LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(callee_tail)
+; LARGE-NEXT: addi.d $a2, $zero, %got_pc_lo12(callee_tail)
+; LARGE-NEXT: lu32i.d $a2, %got64_pc_lo20(callee_tail)
+; LARGE-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(callee_tail)
+; LARGE-NEXT: ldx.d $a1, $a2, $a1
+; LARGE-NEXT: jr $a1
entry:
%r = tail call i32 @callee_tail(i32 %i)
ret i32 %r
diff --git a/llvm/test/CodeGen/LoongArch/expand-call.ll b/llvm/test/CodeGen/LoongArch/expand-call.ll
index 8c21adb..d221200 100644
--- a/llvm/test/CodeGen/LoongArch/expand-call.ll
+++ b/llvm/test/CodeGen/LoongArch/expand-call.ll
@@ -1,6 +1,6 @@
; RUN: llc --mtriple=loongarch64 -mattr=+d --stop-before loongarch-prera-expand-pseudo \
; RUN: --verify-machineinstrs < %s | FileCheck %s --check-prefix=NOEXPAND
-; RUN: llc --mtriple=loongarch64 -mattr=+d --stop-before machine-opt-remark-emitter \
+; RUN: llc --mtriple=loongarch64 --stop-after loongarch-prera-expand-pseudo \
; RUN: --verify-machineinstrs < %s | FileCheck %s --check-prefix=EXPAND
declare void @callee()
diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll
index fb29295..2423dd8 100644
--- a/llvm/test/CodeGen/LoongArch/global-address.ll
+++ b/llvm/test/CodeGen/LoongArch/global-address.ll
@@ -53,32 +53,32 @@ define void @foo() nounwind {
; LA64LARGENOPIC-LABEL: foo:
; LA64LARGENOPIC: # %bb.0:
; LA64LARGENOPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G)
-; LA64LARGENOPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G)
-; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G)
+; LA64LARGENOPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G)
+; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0
; LA64LARGENOPIC-NEXT: ld.w $zero, $a0, 0
; LA64LARGENOPIC-NEXT: pcalau12i $a0, %pc_hi20(g)
-; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %pc_lo12(g)
-; LA64LARGENOPIC-NEXT: lu32i.d $t8, %pc64_lo20(g)
-; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g)
-; LA64LARGENOPIC-NEXT: add.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %pc_lo12(g)
+; LA64LARGENOPIC-NEXT: lu32i.d $a1, %pc64_lo20(g)
+; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g)
+; LA64LARGENOPIC-NEXT: add.d $a0, $a1, $a0
; LA64LARGENOPIC-NEXT: ld.w $zero, $a0, 0
; LA64LARGENOPIC-NEXT: ret
;
; LA64LARGEPIC-LABEL: foo:
; LA64LARGEPIC: # %bb.0:
; LA64LARGEPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G)
-; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G)
-; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LA64LARGEPIC-NEXT: ldx.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G)
+; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G)
+; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LA64LARGEPIC-NEXT: ldx.d $a0, $a1, $a0
; LA64LARGEPIC-NEXT: ld.w $zero, $a0, 0
; LA64LARGEPIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local)
-; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(.Lg$local)
-; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(.Lg$local)
-; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
-; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local)
+; LA64LARGEPIC-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local)
+; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
+; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0
; LA64LARGEPIC-NEXT: ld.w $zero, $a0, 0
; LA64LARGEPIC-NEXT: ret
%V = load volatile i32, ptr @G
diff --git a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll
index 277b0b9..2b7a862 100644
--- a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll
+++ b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll
@@ -20,10 +20,10 @@ define dso_local signext i32 @local_large() #0 {
; CHECK-LABEL: local_large:
; CHECK: # %bb.0:
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(b)
-; CHECK-NEXT: addi.d $t8, $zero, %pc_lo12(b)
-; CHECK-NEXT: lu32i.d $t8, %pc64_lo20(b)
-; CHECK-NEXT: lu52i.d $t8, $t8, %pc64_hi12(b)
-; CHECK-NEXT: add.d $a0, $t8, $a0
+; CHECK-NEXT: addi.d $a1, $zero, %pc_lo12(b)
+; CHECK-NEXT: lu32i.d $a1, %pc64_lo20(b)
+; CHECK-NEXT: lu52i.d $a1, $a1, %pc64_hi12(b)
+; CHECK-NEXT: add.d $a0, $a1, $a0
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: ret
%1 = load i32, ptr @b, align 4
diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll
index 281d52c..becb3ca 100644
--- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll
+++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll
@@ -141,3 +141,49 @@ define i32 @m_offset_2048(ptr %p) nounwind {
%2 = call i32 asm "ld.w $0, $1", "=r,*m"(ptr elementtype(i32) %1)
ret i32 %2
}
+
+@g_i32 = dso_local global i32 0
+
+define i32 @m_addr_pcrel() nounwind {
+; LA32-LABEL: m_addr_pcrel:
+; LA32: # %bb.0:
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT: addi.w $a1, $a0, %pc_lo12(g_i32)
+; LA32-NEXT: #APP
+; LA32-NEXT: ld.w $a0, $a1, 0
+; LA32-NEXT: #NO_APP
+; LA32-NEXT: ret
+;
+; LA64-LABEL: m_addr_pcrel:
+; LA64: # %bb.0:
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT: addi.d $a1, $a0, %pc_lo12(g_i32)
+; LA64-NEXT: #APP
+; LA64-NEXT: ld.w $a0, $a1, 0
+; LA64-NEXT: #NO_APP
+; LA64-NEXT: ret
+ %1 = tail call i32 asm sideeffect "ld.w $0, $1", "=&r,*m"(ptr nonnull elementtype(i32) @g_i32)
+ ret i32 %1
+}
+
+define i32 @m_addr_should_not_fold() nounwind {
+; LA32-LABEL: m_addr_should_not_fold:
+; LA32: # %bb.0:
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT: addi.w $a1, $a0, %pc_lo12(g_i32)
+; LA32-NEXT: #APP
+; LA32-NEXT: ld.w $a0, $a1, 0
+; LA32-NEXT: #NO_APP
+; LA32-NEXT: ret
+;
+; LA64-LABEL: m_addr_should_not_fold:
+; LA64: # %bb.0:
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT: addi.d $a1, $a0, %pc_lo12(g_i32)
+; LA64-NEXT: #APP
+; LA64-NEXT: ld.w $a0, $a1, 0
+; LA64-NEXT: #NO_APP
+; LA64-NEXT: ret
+ %1 = tail call i32 asm sideeffect "ld.w $0, $1, 0", "=&r,r,~{memory}"(ptr nonnull @g_i32)
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index ed1a24e..5248468 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -41,12 +41,12 @@ define void @test_la_pcrel(i32 signext %n) {
;
; LA64LARGE-LABEL: test_la_pcrel:
; LA64LARGE: # %bb.0: # %entry
-; LA64LARGE-NEXT: move $a1, $zero
; LA64LARGE-NEXT: pcalau12i $a2, %pc_hi20(l)
-; LA64LARGE-NEXT: addi.d $t8, $zero, %pc_lo12(l)
-; LA64LARGE-NEXT: lu32i.d $t8, %pc64_lo20(l)
-; LA64LARGE-NEXT: lu52i.d $t8, $t8, %pc64_hi12(l)
-; LA64LARGE-NEXT: add.d $a2, $t8, $a2
+; LA64LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(l)
+; LA64LARGE-NEXT: lu32i.d $a1, %pc64_lo20(l)
+; LA64LARGE-NEXT: lu52i.d $a3, $a1, %pc64_hi12(l)
+; LA64LARGE-NEXT: move $a1, $zero
+; LA64LARGE-NEXT: add.d $a2, $a3, $a2
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB0_1: # %loop
; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1
@@ -103,10 +103,10 @@ define void @test_la_got(i32 signext %n) {
; LA64LARGE-LABEL: test_la_got:
; LA64LARGE: # %bb.0: # %entry
; LA64LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(g)
-; LA64LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(g)
-; LA64LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(g)
-; LA64LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(g)
-; LA64LARGE-NEXT: ldx.d $a1, $t8, $a1
+; LA64LARGE-NEXT: addi.d $a2, $zero, %got_pc_lo12(g)
+; LA64LARGE-NEXT: lu32i.d $a2, %got64_pc_lo20(g)
+; LA64LARGE-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(g)
+; LA64LARGE-NEXT: ldx.d $a1, $a2, $a1
; LA64LARGE-NEXT: move $a2, $zero
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB1_1: # %loop
@@ -165,10 +165,10 @@ define void @test_la_tls_ie(i32 signext %n) {
; LA64LARGE-LABEL: test_la_tls_ie:
; LA64LARGE: # %bb.0: # %entry
; LA64LARGE-NEXT: pcalau12i $a1, %ie_pc_hi20(ie)
-; LA64LARGE-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LA64LARGE-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
-; LA64LARGE-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LA64LARGE-NEXT: ldx.d $a1, $t8, $a1
+; LA64LARGE-NEXT: addi.d $a2, $zero, %ie_pc_lo12(ie)
+; LA64LARGE-NEXT: lu32i.d $a2, %ie64_pc_lo20(ie)
+; LA64LARGE-NEXT: lu52i.d $a2, $a2, %ie64_pc_hi12(ie)
+; LA64LARGE-NEXT: ldx.d $a1, $a2, $a1
; LA64LARGE-NEXT: move $a2, $zero
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB2_1: # %loop
@@ -272,21 +272,21 @@ define void @test_la_tls_ld(i32 signext %n) {
; LA64LARGE-NEXT: .cfi_offset 23, -24
; LA64LARGE-NEXT: .cfi_offset 24, -32
; LA64LARGE-NEXT: move $fp, $a0
+; LA64LARGE-NEXT: pcalau12i $a0, %ld_pc_hi20(ld)
+; LA64LARGE-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld)
+; LA64LARGE-NEXT: lu32i.d $a1, %got64_pc_lo20(ld)
+; LA64LARGE-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld)
; LA64LARGE-NEXT: move $s1, $zero
-; LA64LARGE-NEXT: pcalau12i $s0, %ld_pc_hi20(ld)
-; LA64LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(ld)
-; LA64LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(ld)
-; LA64LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LA64LARGE-NEXT: add.d $s0, $t8, $s0
+; LA64LARGE-NEXT: add.d $s0, $a1, $a0
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB3_1: # %loop
; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1
; LA64LARGE-NEXT: move $a0, $s0
-; LA64LARGE-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGE-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGE-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGE-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGE-NEXT: add.d $ra, $t8, $ra
+; LA64LARGE-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGE-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGE-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGE-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGE-NEXT: add.d $ra, $ra, $a1
; LA64LARGE-NEXT: jirl $ra, $ra, 0
; LA64LARGE-NEXT: ld.w $zero, $a0, 0
; LA64LARGE-NEXT: addi.w $s1, $s1, 1
@@ -438,21 +438,21 @@ define void @test_la_tls_gd(i32 signext %n) nounwind {
; LA64LARGE-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
; LA64LARGE-NEXT: st.d $s1, $sp, 0 # 8-byte Folded Spill
; LA64LARGE-NEXT: move $fp, $a0
+; LA64LARGE-NEXT: pcalau12i $a0, %gd_pc_hi20(gd)
+; LA64LARGE-NEXT: addi.d $a1, $zero, %got_pc_lo12(gd)
+; LA64LARGE-NEXT: lu32i.d $a1, %got64_pc_lo20(gd)
+; LA64LARGE-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(gd)
; LA64LARGE-NEXT: move $s1, $zero
-; LA64LARGE-NEXT: pcalau12i $s0, %gd_pc_hi20(gd)
-; LA64LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(gd)
-; LA64LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(gd)
-; LA64LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(gd)
-; LA64LARGE-NEXT: add.d $s0, $t8, $s0
+; LA64LARGE-NEXT: add.d $s0, $a1, $a0
; LA64LARGE-NEXT: .p2align 4, , 16
; LA64LARGE-NEXT: .LBB5_1: # %loop
; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1
; LA64LARGE-NEXT: move $a0, $s0
-; LA64LARGE-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGE-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGE-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGE-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGE-NEXT: add.d $ra, $t8, $ra
+; LA64LARGE-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGE-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGE-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGE-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGE-NEXT: add.d $ra, $ra, $a1
; LA64LARGE-NEXT: jirl $ra, $ra, 0
; LA64LARGE-NEXT: ld.w $zero, $a0, 0
; LA64LARGE-NEXT: addi.w $s1, $s1, 1
diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
new file mode 100644
index 0000000..48d18db
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll
@@ -0,0 +1,1107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 --mattr=+d --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+d --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA64 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+d --verify-machineinstrs \
+; RUN: --code-model=large < %s | FileCheck --check-prefix=LA64-LARGE %s
+
+@g_i8 = dso_local global i8 0
+
+define dso_local signext i8 @load_s8() nounwind {
+; LA32-LABEL: load_s8:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8)
+; LA32-NEXT: ld.b $a0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_s8:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8)
+; LA64-NEXT: ld.b $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_s8:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.b $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i8, ptr @g_i8
+ ret i8 %0
+}
+
+define dso_local zeroext i8 @load_u8() nounwind {
+; LA32-LABEL: load_u8:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8)
+; LA32-NEXT: ld.bu $a0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_u8:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8)
+; LA64-NEXT: ld.bu $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_u8:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.bu $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i8, ptr @g_i8
+ ret i8 %0
+}
+
+define dso_local void @store_i8() nounwind {
+; LA32-LABEL: store_i8:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8)
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.b $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_i8:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8)
+; LA64-NEXT: ori $a1, $zero, 1
+; LA64-NEXT: st.b $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_i8:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i8)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: st.b $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ store i8 1, ptr @g_i8
+ ret void
+}
+
+@g_i16 = dso_local global i16 0
+
+define dso_local signext i16 @load_s16() nounwind {
+; LA32-LABEL: load_s16:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16)
+; LA32-NEXT: ld.h $a0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_s16:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16)
+; LA64-NEXT: ld.h $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_s16:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.h $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i16, ptr @g_i16
+ ret i16 %0
+}
+
+define dso_local zeroext i16 @load_u16() nounwind {
+; LA32-LABEL: load_u16:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16)
+; LA32-NEXT: ld.hu $a0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_u16:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16)
+; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_u16:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.hu $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i16, ptr @g_i16
+ ret i16 %0
+}
+
+define dso_local void @store_i16() nounwind {
+; LA32-LABEL: store_i16:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16)
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.h $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_i16:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16)
+; LA64-NEXT: ori $a1, $zero, 1
+; LA64-NEXT: st.h $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_i16:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i16)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: st.h $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ store i16 1, ptr @g_i16
+ ret void
+}
+
+@g_i32 = dso_local global i32 0
+
+define dso_local signext i32 @load_s32() nounwind {
+; LA32-LABEL: load_s32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32)
+; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_s32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32)
+; LA64-NEXT: ld.w $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_s32:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.w $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i32, ptr @g_i32
+ ret i32 %0
+}
+
+define dso_local zeroext i32 @load_u32() nounwind {
+; LA32-LABEL: load_u32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32)
+; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_u32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32)
+; LA64-NEXT: ld.wu $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_u32:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.wu $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i32, ptr @g_i32
+ ret i32 %0
+}
+
+define dso_local void @store_i32() nounwind {
+; LA32-LABEL: store_i32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32)
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_i32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32)
+; LA64-NEXT: ori $a1, $zero, 1
+; LA64-NEXT: st.w $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_i32:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: st.w $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ store i32 1, ptr @g_i32
+ ret void
+}
+
+@g_i64 = dso_local global i64 0
+
+define dso_local i64 @load_64() nounwind {
+; LA32-LABEL: load_64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i64)
+; LA32-NEXT: addi.w $a1, $a0, %pc_lo12(g_i64)
+; LA32-NEXT: ld.w $a0, $a1, 0
+; LA32-NEXT: ld.w $a1, $a1, 4
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i64)
+; LA64-NEXT: ld.d $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_64:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.d $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i64, ptr @g_i64
+ ret i64 %0
+}
+
+define dso_local void @store_i64() nounwind {
+; LA32-LABEL: store_i64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i64)
+; LA32-NEXT: st.w $zero, $a0, 4
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_i64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i64)
+; LA64-NEXT: ori $a1, $zero, 1
+; LA64-NEXT: st.d $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_i64:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: st.d $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ store i64 1, ptr @g_i64
+ ret void
+}
+
+@g_f32 = dso_local global float 0.0
+
+define dso_local float @load_f32() nounwind {
+; LA32-LABEL: load_f32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f32)
+; LA32-NEXT: fld.s $fa0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_f32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f32)
+; LA64-NEXT: fld.s $fa0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_f32:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: fld.s $fa0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load float, ptr @g_f32
+ ret float %0
+}
+
+define dso_local void @store_f32() nounwind {
+; LA32-LABEL: store_f32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f32)
+; LA32-NEXT: lu12i.w $a1, 260096
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_f32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f32)
+; LA64-NEXT: lu12i.w $a1, 260096
+; LA64-NEXT: st.w $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_f32:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_f32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: lu12i.w $a1, 260096
+; LA64-LARGE-NEXT: st.w $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ store float 1.0, ptr @g_f32
+ ret void
+}
+
+@g_f64 = dso_local global double 0.0
+
+define dso_local double @load_f64() nounwind {
+; LA32-LABEL: load_f64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f64)
+; LA32-NEXT: fld.d $fa0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_f64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f64)
+; LA64-NEXT: fld.d $fa0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_f64:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: fld.d $fa0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load double, ptr @g_f64
+ ret double %0
+}
+
+define dso_local void @store_f64() nounwind {
+; LA32-LABEL: store_f64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f64)
+; LA32-NEXT: addi.w $a1, $zero, 1
+; LA32-NEXT: movgr2fr.w $fa0, $a1
+; LA32-NEXT: ffint.s.w $fa0, $fa0
+; LA32-NEXT: fcvt.d.s $fa0, $fa0
+; LA32-NEXT: fst.d $fa0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_f64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f64)
+; LA64-NEXT: lu52i.d $a1, $zero, 1023
+; LA64-NEXT: st.d $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_f64:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_f64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: lu52i.d $a1, $zero, 1023
+; LA64-LARGE-NEXT: st.d $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ store double 1.0, ptr @g_f64
+ ret void
+}
+
+@g_m64 = dso_local global i64 0
+
+define dso_local void @store_multi() nounwind {
+; LA32-LABEL: store_multi:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_m64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_m64)
+; LA32-NEXT: st.w $zero, $a0, 4
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: st.w $zero, $a0, 4
+; LA32-NEXT: ori $a1, $zero, 2
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_multi:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_m64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_m64)
+; LA64-NEXT: ori $a1, $zero, 1
+; LA64-NEXT: st.d $a1, $a0, 0
+; LA64-NEXT: ori $a1, $zero, 2
+; LA64-NEXT: st.d $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_multi:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_m64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_m64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_m64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_m64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: st.d $a1, $a0, 0
+; LA64-LARGE-NEXT: ori $a1, $zero, 2
+; LA64-LARGE-NEXT: st.d $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ store volatile i64 1, ptr @g_m64
+ store volatile i64 2, ptr @g_m64
+ ret void
+}
+
+@g_sf32 = dso_local global float 0.0
+
+define dso_local void @store_sf32() nounwind {
+; LA32-LABEL: store_sf32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_sf32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_sf32)
+; LA32-NEXT: fld.s $fa0, $a0, 0
+; LA32-NEXT: fst.s $fa0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_sf32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_sf32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_sf32)
+; LA64-NEXT: fld.s $fa0, $a0, 0
+; LA64-NEXT: fst.s $fa0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_sf32:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_sf32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_sf32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_sf32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_sf32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: fld.s $fa0, $a0, 0
+; LA64-LARGE-NEXT: fst.s $fa0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load float, ptr @g_sf32
+ store volatile float %0, ptr @g_sf32
+ ret void
+}
+
+@g_sf64 = dso_local global double 0.0
+
+define dso_local void @store_sf64() nounwind {
+; LA32-LABEL: store_sf64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_sf64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_sf64)
+; LA32-NEXT: fld.d $fa0, $a0, 0
+; LA32-NEXT: fst.d $fa0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_sf64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_sf64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_sf64)
+; LA64-NEXT: fld.d $fa0, $a0, 0
+; LA64-NEXT: fst.d $fa0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_sf64:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_sf64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_sf64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_sf64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_sf64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: fld.d $fa0, $a0, 0
+; LA64-LARGE-NEXT: fst.d $fa0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load double, ptr @g_sf64
+ store volatile double %0, ptr @g_sf64
+ ret void
+}
+
+@g_rmw = dso_local global i64 0
+
+define dso_local void @rmw() nounwind {
+; LA32-LABEL: rmw:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_rmw)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_rmw)
+; LA32-NEXT: ld.w $a1, $a0, 0
+; LA32-NEXT: ld.w $a2, $a0, 4
+; LA32-NEXT: addi.w $a1, $a1, 1
+; LA32-NEXT: sltui $a3, $a1, 1
+; LA32-NEXT: add.w $a2, $a2, $a3
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: st.w $a2, $a0, 4
+; LA32-NEXT: ret
+;
+; LA64-LABEL: rmw:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_rmw)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_rmw)
+; LA64-NEXT: ld.d $a1, $a0, 0
+; LA64-NEXT: addi.d $a1, $a1, 1
+; LA64-NEXT: st.d $a1, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: rmw:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_rmw)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_rmw)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_rmw)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_rmw)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.d $a1, $a0, 0
+; LA64-LARGE-NEXT: addi.d $a1, $a1, 1
+; LA64-LARGE-NEXT: st.d $a1, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i64, ptr @g_rmw
+ %1 = add i64 %0, 1
+ store i64 %1, ptr @g_rmw
+ ret void
+}
+
+@g_a32 = dso_local global [2048 x i32] zeroinitializer, align 4
+
+define dso_local void @store_a32() nounwind {
+; LA32-LABEL: store_a32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a32)
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_a32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a32)
+; LA64-NEXT: ori $a1, $zero, 1
+; LA64-NEXT: stptr.w $a1, $a0, 4096
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_a32:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: stptr.w $a1, $a0, 4096
+; LA64-LARGE-NEXT: ret
+entry:
+ store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4
+ ret void
+}
+
+define dso_local void @store_a32_2() nounwind {
+; LA32-LABEL: store_a32_2:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a32)
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: add.w $a2, $a0, $a1
+; LA32-NEXT: ori $a3, $zero, 1
+; LA32-NEXT: st.w $a3, $a2, 0
+; LA32-NEXT: ori $a1, $a1, 8
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ori $a1, $zero, 2
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: store_a32_2:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a32)
+; LA64-NEXT: ori $a1, $zero, 1
+; LA64-NEXT: stptr.w $a1, $a0, 4096
+; LA64-NEXT: ori $a1, $zero, 2
+; LA64-NEXT: stptr.w $a1, $a0, 4104
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: store_a32_2:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: stptr.w $a1, $a0, 4096
+; LA64-LARGE-NEXT: ori $a1, $zero, 2
+; LA64-LARGE-NEXT: stptr.w $a1, $a0, 4104
+; LA64-LARGE-NEXT: ret
+entry:
+ store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4
+ store i32 2, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1026), align 4
+ ret void
+}
+
+define dso_local void @control_flow_with_mem_access() nounwind {
+; LA32-LABEL: control_flow_with_mem_access:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a32)
+; LA32-NEXT: ld.w $a1, $a0, 4
+; LA32-NEXT: ori $a2, $zero, 1
+; LA32-NEXT: blt $a1, $a2, .LBB21_2
+; LA32-NEXT: # %bb.1: # %if.then
+; LA32-NEXT: ori $a1, $zero, 10
+; LA32-NEXT: st.w $a1, $a0, 4
+; LA32-NEXT: .LBB21_2: # %if.end
+; LA32-NEXT: ret
+;
+; LA64-LABEL: control_flow_with_mem_access:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a32)
+; LA64-NEXT: ld.w $a1, $a0, 4
+; LA64-NEXT: ori $a2, $zero, 1
+; LA64-NEXT: blt $a1, $a2, .LBB21_2
+; LA64-NEXT: # %bb.1: # %if.then
+; LA64-NEXT: ori $a1, $zero, 10
+; LA64-NEXT: st.w $a1, $a0, 4
+; LA64-NEXT: .LBB21_2: # %if.end
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: control_flow_with_mem_access:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.w $a0, $a0, 4
+; LA64-LARGE-NEXT: ori $a1, $zero, 1
+; LA64-LARGE-NEXT: blt $a0, $a1, .LBB21_2
+; LA64-LARGE-NEXT: # %bb.1: # %if.then
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ori $a1, $zero, 10
+; LA64-LARGE-NEXT: st.w $a1, $a0, 4
+; LA64-LARGE-NEXT: .LBB21_2: # %if.end
+; LA64-LARGE-NEXT: ret
+entry:
+ %0 = load i32, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1), align 4
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 10, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1), align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define dso_local ptr @load_ba_1() nounwind {
+; LA32-LABEL: load_ba_1:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: .Ltmp0: # Block address taken
+; LA32-NEXT: # %bb.1: # %label
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp0)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.Ltmp0)
+; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_ba_1:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: .Ltmp0: # Block address taken
+; LA64-NEXT: # %bb.1: # %label
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp0)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.Ltmp0)
+; LA64-NEXT: ld.d $a0, $a0, 0
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_ba_1:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: .Ltmp0: # Block address taken
+; LA64-LARGE-NEXT: # %bb.1: # %label
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp0)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp0)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp0)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp0)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.d $a0, $a0, 0
+; LA64-LARGE-NEXT: ret
+entry:
+ br label %label
+label:
+ %0 = load ptr, ptr blockaddress(@load_ba_1, %label)
+ ret ptr %0
+}
+
+define dso_local ptr @load_ba_2() nounwind {
+; LA32-LABEL: load_ba_2:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: .Ltmp1: # Block address taken
+; LA32-NEXT: # %bb.1: # %label
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.Ltmp1)
+; LA32-NEXT: ld.w $a0, $a0, 8
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_ba_2:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: .Ltmp1: # Block address taken
+; LA64-NEXT: # %bb.1: # %label
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.Ltmp1)
+; LA64-NEXT: ld.d $a0, $a0, 8
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_ba_2:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: .Ltmp1: # Block address taken
+; LA64-LARGE-NEXT: # %bb.1: # %label
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp1)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp1)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp1)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: ld.d $a0, $a0, 8
+; LA64-LARGE-NEXT: ret
+entry:
+ br label %label
+label:
+ %0 = load ptr, ptr getelementptr inbounds (i8, ptr blockaddress(@load_ba_2, %label), i32 8)
+ ret ptr %0
+}
+
+@g_a64 = dso_local global [614750729487779976 x i64] zeroinitializer, align 8
+
+define dso_local ptr @load_addr_offset_1() nounwind {
+; LA32-LABEL: load_addr_offset_1:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, 8
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_1:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, 8
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_1:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: addi.d $a0, $a0, 8
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1)
+}
+
+define dso_local ptr @load_addr_offset_257() nounwind {
+; LA32-LABEL: load_addr_offset_257:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, 2047
+; LA32-NEXT: addi.w $a0, $a0, 9
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_257:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, 2047
+; LA64-NEXT: addi.d $a0, $a0, 9
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_257:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: addi.d $a0, $a0, 2047
+; LA64-LARGE-NEXT: addi.d $a0, $a0, 9
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 257)
+}
+
+define dso_local ptr @load_addr_offset_1048576() nounwind {
+; LA32-LABEL: load_addr_offset_1048576:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: lu12i.w $a1, 2048
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_1048576:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: addu16i.d $a0, $a0, 128
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_1048576:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: addu16i.d $a0, $a0, 128
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048576)
+}
+
+define dso_local ptr @load_addr_offset_1048577() nounwind {
+; LA32-LABEL: load_addr_offset_1048577:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: lu12i.w $a1, 2048
+; LA32-NEXT: ori $a1, $a1, 8
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_1048577:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: addu16i.d $a0, $a0, 128
+; LA64-NEXT: addi.d $a0, $a0, 8
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_1048577:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: addu16i.d $a0, $a0, 128
+; LA64-LARGE-NEXT: addi.d $a0, $a0, 8
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048577)
+}
+
+define dso_local ptr @load_addr_offset_268432896() nounwind {
+; LA32-LABEL: load_addr_offset_268432896:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: lu12i.w $a1, 524283
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_268432896:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: lu12i.w $a1, 524283
+; LA64-NEXT: add.d $a0, $a0, $a1
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_268432896:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: lu12i.w $a1, 524283
+; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432896)
+}
+
+define dso_local ptr @load_addr_offset_268432897() nounwind {
+; LA32-LABEL: load_addr_offset_268432897:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: lu12i.w $a1, 524283
+; LA32-NEXT: ori $a1, $a1, 8
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_268432897:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: lu12i.w $a1, 524283
+; LA64-NEXT: ori $a1, $a1, 8
+; LA64-NEXT: add.d $a0, $a0, $a1
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_268432897:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: lu12i.w $a1, 524283
+; LA64-LARGE-NEXT: ori $a1, $a1, 8
+; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432897)
+}
+
+define dso_local ptr @load_addr_offset_9380351707272() nounwind {
+; LA32-LABEL: load_addr_offset_9380351707272:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: lu12i.w $a1, 279556
+; LA32-NEXT: ori $a1, $a1, 1088
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_9380351707272:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: lu12i.w $a1, 279556
+; LA64-NEXT: ori $a1, $a1, 1088
+; LA64-NEXT: lu32i.d $a1, 17472
+; LA64-NEXT: add.d $a0, $a0, $a1
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_9380351707272:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: lu12i.w $a1, 279556
+; LA64-LARGE-NEXT: ori $a1, $a1, 1088
+; LA64-LARGE-NEXT: lu32i.d $a1, 17472
+; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 9380351707272)
+}
+
+define dso_local ptr @load_addr_offset_614750729487779976() nounwind {
+; LA32-LABEL: load_addr_offset_614750729487779976:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64)
+; LA32-NEXT: lu12i.w $a1, 279556
+; LA32-NEXT: ori $a1, $a1, 1088
+; LA32-NEXT: add.w $a0, $a0, $a1
+; LA32-NEXT: ret
+;
+; LA64-LABEL: load_addr_offset_614750729487779976:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64)
+; LA64-NEXT: lu12i.w $a1, 279556
+; LA64-NEXT: ori $a1, $a1, 1088
+; LA64-NEXT: lu32i.d $a1, 17472
+; LA64-NEXT: lu52i.d $a1, $a1, 1092
+; LA64-NEXT: add.d $a0, $a0, $a1
+; LA64-NEXT: ret
+;
+; LA64-LARGE-LABEL: load_addr_offset_614750729487779976:
+; LA64-LARGE: # %bb.0: # %entry
+; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64)
+; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64)
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64)
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64)
+; LA64-LARGE-NEXT: add.d $a0, $a1, $a0
+; LA64-LARGE-NEXT: lu12i.w $a1, 279556
+; LA64-LARGE-NEXT: ori $a1, $a1, 1088
+; LA64-LARGE-NEXT: lu32i.d $a1, 17472
+; LA64-LARGE-NEXT: lu52i.d $a1, $a1, 1092
+; LA64-LARGE-NEXT: add.d $a0, $a0, $a1
+; LA64-LARGE-NEXT: ret
+entry:
+ ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 614750729487779976)
+}
diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
index 6a15d3a..b03a523 100644
--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -82,54 +82,54 @@ define void @foo() nounwind {
; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, -16
; LARGE_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LARGE_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G)
+; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G)
+; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0
; LARGE_NO_SCH-NEXT: ld.d $zero, $a0, 0
; LARGE_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(.Lg$local)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %pc64_lo20(.Lg$local)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
-; LARGE_NO_SCH-NEXT: add.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local)
+; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local)
+; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
+; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0
; LARGE_NO_SCH-NEXT: ld.d $zero, $a0, 0
; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1
-; LARGE_NO_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar)
-; LARGE_NO_SCH-NEXT: ldx.d $ra, $t8, $ra
+; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar)
+; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar)
+; LARGE_NO_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar)
+; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar)
+; LARGE_NO_SCH-NEXT: ldx.d $ra, $ra, $a1
; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0
; LARGE_NO_SCH-NEXT: pcalau12i $a0, %gd_pc_hi20(gd)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(gd)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(gd)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(gd)
-; LARGE_NO_SCH-NEXT: add.d $a0, $t8, $a0
-; LARGE_NO_SCH-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: add.d $ra, $t8, $ra
+; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(gd)
+; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(gd)
+; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(gd)
+; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: add.d $ra, $ra, $a1
; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0
; LARGE_NO_SCH-NEXT: ld.d $zero, $a0, 0
; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ld_pc_hi20(ld)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(ld)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(ld)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LARGE_NO_SCH-NEXT: add.d $a0, $t8, $a0
-; LARGE_NO_SCH-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_NO_SCH-NEXT: add.d $ra, $t8, $ra
+; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld)
+; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(ld)
+; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld)
+; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_NO_SCH-NEXT: add.d $ra, $ra, $a1
; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0
-; LARGE_NO_SCH-NEXT: pcalau12i $a1, %ie_pc_hi20(ie)
-; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
-; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LARGE_NO_SCH-NEXT: ldx.d $a1, $t8, $a1
; LARGE_NO_SCH-NEXT: ld.d $zero, $a0, 0
-; LARGE_NO_SCH-NEXT: ldx.d $zero, $a1, $tp
+; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie)
+; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT: ldx.d $zero, $a0, $tp
; LARGE_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, 16
; LARGE_NO_SCH-NEXT: ret
@@ -139,54 +139,54 @@ define void @foo() nounwind {
; LARGE_SCH-NEXT: addi.d $sp, $sp, -16
; LARGE_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LARGE_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G)
-; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G)
-; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0
+; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G)
+; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G)
+; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0
; LARGE_SCH-NEXT: ld.d $zero, $a0, 0
; LARGE_SCH-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(.Lg$local)
-; LARGE_SCH-NEXT: lu32i.d $t8, %pc64_lo20(.Lg$local)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
-; LARGE_SCH-NEXT: add.d $a0, $t8, $a0
+; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local)
+; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local)
+; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
+; LARGE_SCH-NEXT: add.d $a0, $a1, $a0
; LARGE_SCH-NEXT: ld.d $zero, $a0, 0
; LARGE_SCH-NEXT: ori $a0, $zero, 1
-; LARGE_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar)
-; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar)
-; LARGE_SCH-NEXT: ldx.d $ra, $t8, $ra
+; LARGE_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar)
+; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar)
+; LARGE_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar)
+; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar)
+; LARGE_SCH-NEXT: ldx.d $ra, $ra, $a1
; LARGE_SCH-NEXT: jirl $ra, $ra, 0
; LARGE_SCH-NEXT: pcalau12i $a0, %gd_pc_hi20(gd)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(gd)
-; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(gd)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(gd)
-; LARGE_SCH-NEXT: add.d $a0, $t8, $a0
-; LARGE_SCH-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_SCH-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_SCH-NEXT: add.d $ra, $t8, $ra
+; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(gd)
+; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(gd)
+; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(gd)
+; LARGE_SCH-NEXT: add.d $a0, $a1, $a0
+; LARGE_SCH-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_SCH-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_SCH-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_SCH-NEXT: add.d $ra, $ra, $a1
; LARGE_SCH-NEXT: jirl $ra, $ra, 0
; LARGE_SCH-NEXT: ld.d $zero, $a0, 0
; LARGE_SCH-NEXT: pcalau12i $a0, %ld_pc_hi20(ld)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(ld)
-; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(ld)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LARGE_SCH-NEXT: add.d $a0, $t8, $a0
-; LARGE_SCH-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LARGE_SCH-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LARGE_SCH-NEXT: add.d $ra, $t8, $ra
+; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld)
+; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(ld)
+; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld)
+; LARGE_SCH-NEXT: add.d $a0, $a1, $a0
+; LARGE_SCH-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LARGE_SCH-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LARGE_SCH-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LARGE_SCH-NEXT: add.d $ra, $ra, $a1
; LARGE_SCH-NEXT: jirl $ra, $ra, 0
-; LARGE_SCH-NEXT: pcalau12i $a1, %ie_pc_hi20(ie)
-; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
-; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LARGE_SCH-NEXT: ldx.d $a1, $t8, $a1
; LARGE_SCH-NEXT: ld.d $zero, $a0, 0
-; LARGE_SCH-NEXT: ldx.d $zero, $a1, $tp
+; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie)
+; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0
+; LARGE_SCH-NEXT: ldx.d $zero, $a0, $tp
; LARGE_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; LARGE_SCH-NEXT: addi.d $sp, $sp, 16
; LARGE_SCH-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
index 75461f5..774cf61 100644
--- a/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
+++ b/llvm/test/CodeGen/LoongArch/rotl-rotr.ll
@@ -5,15 +5,13 @@
define signext i32 @rotl_32(i32 signext %x, i32 signext %y) nounwind {
; LA32-LABEL: rotl_32:
; LA32: # %bb.0:
-; LA32-NEXT: ori $a2, $zero, 32
-; LA32-NEXT: sub.w $a1, $a2, $a1
+; LA32-NEXT: sub.w $a1, $zero, $a1
; LA32-NEXT: rotr.w $a0, $a0, $a1
; LA32-NEXT: ret
;
; LA64-LABEL: rotl_32:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a2, $zero, 32
-; LA64-NEXT: sub.d $a1, $a2, $a1
+; LA64-NEXT: sub.d $a1, $zero, $a1
; LA64-NEXT: rotr.w $a0, $a0, $a1
; LA64-NEXT: ret
%z = sub i32 32, %y
@@ -80,8 +78,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
;
; LA64-LABEL: rotl_64:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a2, $zero, 64
-; LA64-NEXT: sub.d $a1, $a2, $a1
+; LA64-NEXT: sub.d $a1, $zero, $a1
; LA64-NEXT: rotr.d $a0, $a0, $a1
; LA64-NEXT: ret
%z = sub i64 64, %y
@@ -149,8 +146,7 @@ define signext i32 @rotl_32_mask(i32 signext %x, i32 signext %y) nounwind {
;
; LA64-LABEL: rotl_32_mask:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a2, $zero, 32
-; LA64-NEXT: sub.d $a1, $a2, $a1
+; LA64-NEXT: sub.d $a1, $zero, $a1
; LA64-NEXT: rotr.w $a0, $a0, $a1
; LA64-NEXT: ret
%z = sub i32 0, %y
@@ -170,8 +166,7 @@ define signext i32 @rotl_32_mask_and_63_and_31(i32 signext %x, i32 signext %y) n
;
; LA64-LABEL: rotl_32_mask_and_63_and_31:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a2, $zero, 32
-; LA64-NEXT: sub.d $a1, $a2, $a1
+; LA64-NEXT: sub.d $a1, $zero, $a1
; LA64-NEXT: rotr.w $a0, $a0, $a1
; LA64-NEXT: ret
%a = and i32 %y, 63
@@ -192,8 +187,7 @@ define signext i32 @rotl_32_mask_or_64_or_32(i32 signext %x, i32 signext %y) nou
;
; LA64-LABEL: rotl_32_mask_or_64_or_32:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a2, $zero, 32
-; LA64-NEXT: sub.d $a1, $a2, $a1
+; LA64-NEXT: sub.d $a1, $zero, $a1
; LA64-NEXT: rotr.w $a0, $a0, $a1
; LA64-NEXT: ret
%a = or i32 %y, 64
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index bb89794..4ac6201 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -51,15 +51,15 @@ define ptr @f1() nounwind {
; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16
; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64LARGEPIC-NEXT: pcalau12i $a0, %gd_pc_hi20(unspecified)
-; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(unspecified)
-; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(unspecified)
-; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(unspecified)
-; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0
-; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT: add.d $ra, $t8, $ra
+; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(unspecified)
+; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(unspecified)
+; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(unspecified)
+; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1
; LA64LARGEPIC-NEXT: jirl $ra, $ra, 0
; LA64LARGEPIC-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; LA64LARGEPIC-NEXT: addi.d $sp, $sp, 16
@@ -82,10 +82,10 @@ define ptr @f1() nounwind {
; LA64LARGENOPIC-LABEL: f1:
; LA64LARGENOPIC: # %bb.0: # %entry
; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(unspecified)
-; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(unspecified)
-; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(unspecified)
-; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(unspecified)
-; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(unspecified)
+; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(unspecified)
+; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(unspecified)
+; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0
; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp
; LA64LARGENOPIC-NEXT: ret
;
@@ -120,14 +120,13 @@ define ptr @f1() nounwind {
; DESC64-NEXT: addi.d $sp, $sp, -16
; DESC64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; DESC64-NEXT: pcalau12i $a0, %desc_pc_hi20(unspecified)
-; DESC64-NEXT: addi.d $t8, $zero, %desc_pc_lo12(unspecified)
-; DESC64-NEXT: lu32i.d $t8, %desc64_pc_lo20(unspecified)
-; DESC64-NEXT: lu52i.d $t8, $t8, %desc64_pc_hi12(unspecified)
-; DESC64-NEXT: add.d $a0, $t8, $a0
+; DESC64-NEXT: addi.d $a1, $zero, %desc_pc_lo12(unspecified)
+; DESC64-NEXT: lu32i.d $a1, %desc64_pc_lo20(unspecified)
+; DESC64-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(unspecified)
+; DESC64-NEXT: add.d $a0, $a0, $a1
; DESC64-NEXT: ld.d $ra, $a0, %desc_ld(unspecified)
; DESC64-NEXT: jirl $ra, $ra, %desc_call(unspecified)
-; DESC64-NEXT: add.d $a1, $a0, $tp
-; DESC64-NEXT: move $a0, $a1
+; DESC64-NEXT: add.d $a0, $a0, $tp
; DESC64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; DESC64-NEXT: addi.d $sp, $sp, 16
; DESC64-NEXT: ret
@@ -165,15 +164,15 @@ define ptr @f2() nounwind {
; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16
; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64LARGEPIC-NEXT: pcalau12i $a0, %ld_pc_hi20(ld)
-; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(ld)
-; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(ld)
-; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(ld)
-; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0
-; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT: add.d $ra, $t8, $ra
+; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld)
+; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(ld)
+; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld)
+; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1
; LA64LARGEPIC-NEXT: jirl $ra, $ra, 0
; LA64LARGEPIC-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; LA64LARGEPIC-NEXT: addi.d $sp, $sp, 16
@@ -196,10 +195,10 @@ define ptr @f2() nounwind {
; LA64LARGENOPIC-LABEL: f2:
; LA64LARGENOPIC: # %bb.0: # %entry
; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ld)
-; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld)
-; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld)
-; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
-; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld)
+; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld)
+; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
+; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0
; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp
; LA64LARGENOPIC-NEXT: ret
;
@@ -234,14 +233,13 @@ define ptr @f2() nounwind {
; DESC64-NEXT: addi.d $sp, $sp, -16
; DESC64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; DESC64-NEXT: pcalau12i $a0, %desc_pc_hi20(ld)
-; DESC64-NEXT: addi.d $t8, $zero, %desc_pc_lo12(ld)
-; DESC64-NEXT: lu32i.d $t8, %desc64_pc_lo20(ld)
-; DESC64-NEXT: lu52i.d $t8, $t8, %desc64_pc_hi12(ld)
-; DESC64-NEXT: add.d $a0, $t8, $a0
+; DESC64-NEXT: addi.d $a1, $zero, %desc_pc_lo12(ld)
+; DESC64-NEXT: lu32i.d $a1, %desc64_pc_lo20(ld)
+; DESC64-NEXT: lu52i.d $a1, $a1, %desc64_pc_hi12(ld)
+; DESC64-NEXT: add.d $a0, $a0, $a1
; DESC64-NEXT: ld.d $ra, $a0, %desc_ld(ld)
; DESC64-NEXT: jirl $ra, $ra, %desc_call(ld)
-; DESC64-NEXT: add.d $a1, $a0, $tp
-; DESC64-NEXT: move $a0, $a1
+; DESC64-NEXT: add.d $a0, $a0, $tp
; DESC64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; DESC64-NEXT: addi.d $sp, $sp, 16
; DESC64-NEXT: ret
@@ -269,10 +267,10 @@ define ptr @f3() nounwind {
; LA64LARGEPIC-LABEL: f3:
; LA64LARGEPIC: # %bb.0: # %entry
; LA64LARGEPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LA64LARGEPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
-; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LA64LARGEPIC-NEXT: ldx.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LA64LARGEPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie)
+; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LA64LARGEPIC-NEXT: ldx.d $a0, $a1, $a0
; LA64LARGEPIC-NEXT: add.d $a0, $a0, $tp
; LA64LARGEPIC-NEXT: ret
;
@@ -293,10 +291,10 @@ define ptr @f3() nounwind {
; LA64LARGENOPIC-LABEL: f3:
; LA64LARGENOPIC: # %bb.0: # %entry
; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
-; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
-; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0
+; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie)
+; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0
; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp
; LA64LARGENOPIC-NEXT: ret
;
@@ -317,10 +315,10 @@ define ptr @f3() nounwind {
; DESC64-LABEL: f3:
; DESC64: # %bb.0: # %entry
; DESC64-NEXT: pcalau12i $a0, %ie_pc_hi20(ie)
-; DESC64-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie)
-; DESC64-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie)
-; DESC64-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
-; DESC64-NEXT: ldx.d $a0, $t8, $a0
+; DESC64-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie)
+; DESC64-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie)
+; DESC64-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; DESC64-NEXT: ldx.d $a0, $a1, $a0
; DESC64-NEXT: add.d $a0, $a0, $tp
; DESC64-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/ucmp.ll b/llvm/test/CodeGen/LoongArch/ucmp.ll
index 548c5bd..b91d3bf 100644
--- a/llvm/test/CodeGen/LoongArch/ucmp.ll
+++ b/llvm/test/CodeGen/LoongArch/ucmp.ll
@@ -26,8 +26,8 @@ define i8 @ucmp.8.16(i16 zeroext %x, i16 zeroext %y) nounwind {
define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ucmp.8.32:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 0
-; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT: addi.w $a1, $a1, 0
+; CHECK-NEXT: addi.w $a0, $a0, 0
; CHECK-NEXT: sltu $a2, $a0, $a1
; CHECK-NEXT: sltu $a0, $a1, $a0
; CHECK-NEXT: sub.d $a0, $a0, $a2
@@ -71,8 +71,8 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ucmp.32.32:
; CHECK: # %bb.0:
-; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 0
-; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT: addi.w $a1, $a1, 0
+; CHECK-NEXT: addi.w $a0, $a0, 0
; CHECK-NEXT: sltu $a2, $a0, $a1
; CHECK-NEXT: sltu $a0, $a1, $a0
; CHECK-NEXT: sub.d $a0, $a0, $a2
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index d318445..fcc4ec6e 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -1,6 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
-; RUN: llc < %s -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK-F16
-; RUN: llc < %s -mcpu=sm_80 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-NOF16
+; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 | FileCheck %s --check-prefixes=CHECK,CHECK-F16
+; RUN: llc < %s -mcpu=sm_80 -mattr +ptx70 --nvptx-no-f16-math | FileCheck %s --check-prefixes=CHECK,CHECK-SM80-NOF16
; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-11.0 %{ llc < %s -mcpu=sm_80 --nvptx-no-f16-math | %ptxas-verify -arch=sm_80 %}
@@ -29,330 +30,1571 @@ declare half @llvm.minnum.f16(half, half) #0
declare float @llvm.minnum.f32(float, float) #0
declare double @llvm.minnum.f64(double, double) #0
declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.minimum.f16(half, half) #0
+declare float @llvm.minimum.f32(float, float) #0
+declare double @llvm.minimum.f64(double, double) #0
+declare <2 x half> @llvm.minimum.v2f16(<2 x half>, <2 x half>) #0
declare half @llvm.maxnum.f16(half, half) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.maxnum.f64(double, double) #0
declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #0
+declare half @llvm.maximum.f16(half, half) #0
+declare float @llvm.maximum.f32(float, float) #0
+declare double @llvm.maximum.f64(double, double) #0
+declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) #0
declare float @llvm.fma.f32(float, float, float) #0
declare double @llvm.fma.f64(double, double, double) #0
; ---- ceil ----
-; CHECK-LABEL: ceil_float
define float @ceil_float(float %a) {
- ; CHECK: cvt.rpi.f32.f32
+; CHECK-LABEL: ceil_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [ceil_float_param_0];
+; CHECK-NEXT: cvt.rpi.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.ceil.f32(float %a)
ret float %b
}
-; CHECK-LABEL: ceil_float_ftz
define float @ceil_float_ftz(float %a) #1 {
- ; CHECK: cvt.rpi.ftz.f32.f32
+; CHECK-LABEL: ceil_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [ceil_float_ftz_param_0];
+; CHECK-NEXT: cvt.rpi.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.ceil.f32(float %a)
ret float %b
}
-; CHECK-LABEL: ceil_double
define double @ceil_double(double %a) {
- ; CHECK: cvt.rpi.f64.f64
+; CHECK-LABEL: ceil_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [ceil_double_param_0];
+; CHECK-NEXT: cvt.rpi.f64.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: ret;
%b = call double @llvm.ceil.f64(double %a)
ret double %b
}
; ---- floor ----
-; CHECK-LABEL: floor_float
define float @floor_float(float %a) {
- ; CHECK: cvt.rmi.f32.f32
+; CHECK-LABEL: floor_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [floor_float_param_0];
+; CHECK-NEXT: cvt.rmi.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.floor.f32(float %a)
ret float %b
}
-; CHECK-LABEL: floor_float_ftz
define float @floor_float_ftz(float %a) #1 {
- ; CHECK: cvt.rmi.ftz.f32.f32
+; CHECK-LABEL: floor_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [floor_float_ftz_param_0];
+; CHECK-NEXT: cvt.rmi.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.floor.f32(float %a)
ret float %b
}
-; CHECK-LABEL: floor_double
define double @floor_double(double %a) {
- ; CHECK: cvt.rmi.f64.f64
+; CHECK-LABEL: floor_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [floor_double_param_0];
+; CHECK-NEXT: cvt.rmi.f64.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: ret;
%b = call double @llvm.floor.f64(double %a)
ret double %b
}
; ---- round ----
-; CHECK-LABEL: round_float
define float @round_float(float %a) {
; check the use of sign mask and 0.5 to implement round
-; CHECK: and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK: or.b32 {{.*}}, [[R1]], 1056964608;
+; CHECK-LABEL: round_float(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [round_float_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f1;
+; CHECK-NEXT: and.b32 %r2, %r1, -2147483648;
+; CHECK-NEXT: or.b32 %r3, %r2, 1056964608;
+; CHECK-NEXT: mov.b32 %f2, %r3;
+; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2;
+; CHECK-NEXT: cvt.rzi.f32.f32 %f4, %f3;
+; CHECK-NEXT: abs.f32 %f5, %f1;
+; CHECK-NEXT: setp.gt.f32 %p1, %f5, 0f4B000000;
+; CHECK-NEXT: selp.f32 %f6, %f1, %f4, %p1;
+; CHECK-NEXT: cvt.rzi.f32.f32 %f7, %f1;
+; CHECK-NEXT: setp.lt.f32 %p2, %f5, 0f3F000000;
+; CHECK-NEXT: selp.f32 %f8, %f7, %f6, %p2;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f8;
+; CHECK-NEXT: ret;
%b = call float @llvm.round.f32(float %a)
ret float %b
}
-; CHECK-LABEL: round_float_ftz
define float @round_float_ftz(float %a) #1 {
; check the use of sign mask and 0.5 to implement round
-; CHECK: and.b32 [[R1:%r[0-9]+]], {{.*}}, -2147483648;
-; CHECK: or.b32 {{.*}}, [[R1]], 1056964608;
+; CHECK-LABEL: round_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [round_float_ftz_param_0];
+; CHECK-NEXT: mov.b32 %r1, %f1;
+; CHECK-NEXT: and.b32 %r2, %r1, -2147483648;
+; CHECK-NEXT: or.b32 %r3, %r2, 1056964608;
+; CHECK-NEXT: mov.b32 %f2, %r3;
+; CHECK-NEXT: add.rn.ftz.f32 %f3, %f1, %f2;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f4, %f3;
+; CHECK-NEXT: abs.ftz.f32 %f5, %f1;
+; CHECK-NEXT: setp.gt.ftz.f32 %p1, %f5, 0f4B000000;
+; CHECK-NEXT: selp.f32 %f6, %f1, %f4, %p1;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f7, %f1;
+; CHECK-NEXT: setp.lt.ftz.f32 %p2, %f5, 0f3F000000;
+; CHECK-NEXT: selp.f32 %f8, %f7, %f6, %p2;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f8;
+; CHECK-NEXT: ret;
%b = call float @llvm.round.f32(float %a)
ret float %b
}
-; CHECK-LABEL: round_double
define double @round_double(double %a) {
; check the use of 0.5 to implement round
-; CHECK: setp.lt.f64 {{.*}}, [[R:%fd[0-9]+]], 0d3FE0000000000000;
-; CHECK: add.rn.f64 {{.*}}, [[R]], 0d3FE0000000000000;
+; CHECK-LABEL: round_double(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .f64 %fd<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [round_double_param_0];
+; CHECK-NEXT: abs.f64 %fd2, %fd1;
+; CHECK-NEXT: setp.lt.f64 %p1, %fd2, 0d3FE0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FE0000000000000;
+; CHECK-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
+; CHECK-NEXT: selp.f64 %fd5, 0d0000000000000000, %fd4, %p1;
+; CHECK-NEXT: abs.f64 %fd6, %fd5;
+; CHECK-NEXT: neg.f64 %fd7, %fd6;
+; CHECK-NEXT: mov.b64 %rd1, %fd1;
+; CHECK-NEXT: shr.u64 %rd2, %rd1, 63;
+; CHECK-NEXT: and.b64 %rd3, %rd2, 1;
+; CHECK-NEXT: setp.eq.b64 %p2, %rd3, 1;
+; CHECK-NEXT: selp.f64 %fd8, %fd7, %fd6, %p2;
+; CHECK-NEXT: setp.gt.f64 %p3, %fd2, 0d4330000000000000;
+; CHECK-NEXT: selp.f64 %fd9, %fd1, %fd8, %p3;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd9;
+; CHECK-NEXT: ret;
%b = call double @llvm.round.f64(double %a)
ret double %b
}
; ---- nearbyint ----
-; CHECK-LABEL: nearbyint_float
define float @nearbyint_float(float %a) {
- ; CHECK: cvt.rni.f32.f32
+; CHECK-LABEL: nearbyint_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [nearbyint_float_param_0];
+; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.nearbyint.f32(float %a)
ret float %b
}
-; CHECK-LABEL: nearbyint_float_ftz
define float @nearbyint_float_ftz(float %a) #1 {
- ; CHECK: cvt.rni.ftz.f32.f32
+; CHECK-LABEL: nearbyint_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [nearbyint_float_ftz_param_0];
+; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.nearbyint.f32(float %a)
ret float %b
}
-; CHECK-LABEL: nearbyint_double
define double @nearbyint_double(double %a) {
- ; CHECK: cvt.rni.f64.f64
+; CHECK-LABEL: nearbyint_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [nearbyint_double_param_0];
+; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: ret;
%b = call double @llvm.nearbyint.f64(double %a)
ret double %b
}
; ---- rint ----
-; CHECK-LABEL: rint_float
define float @rint_float(float %a) {
- ; CHECK: cvt.rni.f32.f32
+; CHECK-LABEL: rint_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [rint_float_param_0];
+; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.rint.f32(float %a)
ret float %b
}
-; CHECK-LABEL: rint_float_ftz
define float @rint_float_ftz(float %a) #1 {
- ; CHECK: cvt.rni.ftz.f32.f32
+; CHECK-LABEL: rint_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [rint_float_ftz_param_0];
+; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.rint.f32(float %a)
ret float %b
}
-; CHECK-LABEL: rint_double
define double @rint_double(double %a) {
- ; CHECK: cvt.rni.f64.f64
+; CHECK-LABEL: rint_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [rint_double_param_0];
+; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: ret;
%b = call double @llvm.rint.f64(double %a)
ret double %b
}
; ---- roundeven ----
-; CHECK-LABEL: roundeven_float
define float @roundeven_float(float %a) {
- ; CHECK: cvt.rni.f32.f32
+; CHECK-LABEL: roundeven_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [roundeven_float_param_0];
+; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.roundeven.f32(float %a)
ret float %b
}
-; CHECK-LABEL: roundeven_float_ftz
define float @roundeven_float_ftz(float %a) #1 {
- ; CHECK: cvt.rni.ftz.f32.f32
+; CHECK-LABEL: roundeven_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [roundeven_float_ftz_param_0];
+; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.roundeven.f32(float %a)
ret float %b
}
-; CHECK-LABEL: roundeven_double
define double @roundeven_double(double %a) {
- ; CHECK: cvt.rni.f64.f64
+; CHECK-LABEL: roundeven_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [roundeven_double_param_0];
+; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: ret;
%b = call double @llvm.roundeven.f64(double %a)
ret double %b
}
; ---- trunc ----
-; CHECK-LABEL: trunc_float
define float @trunc_float(float %a) {
- ; CHECK: cvt.rzi.f32.f32
+; CHECK-LABEL: trunc_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [trunc_float_param_0];
+; CHECK-NEXT: cvt.rzi.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.trunc.f32(float %a)
ret float %b
}
-; CHECK-LABEL: trunc_float_ftz
define float @trunc_float_ftz(float %a) #1 {
- ; CHECK: cvt.rzi.ftz.f32.f32
+; CHECK-LABEL: trunc_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [trunc_float_ftz_param_0];
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.trunc.f32(float %a)
ret float %b
}
-; CHECK-LABEL: trunc_double
define double @trunc_double(double %a) {
- ; CHECK: cvt.rzi.f64.f64
+; CHECK-LABEL: trunc_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [trunc_double_param_0];
+; CHECK-NEXT: cvt.rzi.f64.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: ret;
%b = call double @llvm.trunc.f64(double %a)
ret double %b
}
; ---- abs ----
-; CHECK-LABEL: abs_float
define float @abs_float(float %a) {
- ; CHECK: abs.f32
+; CHECK-LABEL: abs_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [abs_float_param_0];
+; CHECK-NEXT: abs.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.fabs.f32(float %a)
ret float %b
}
-; CHECK-LABEL: abs_float_ftz
define float @abs_float_ftz(float %a) #1 {
- ; CHECK: abs.ftz.f32
+; CHECK-LABEL: abs_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [abs_float_ftz_param_0];
+; CHECK-NEXT: abs.ftz.f32 %f2, %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%b = call float @llvm.fabs.f32(float %a)
ret float %b
}
-; CHECK-LABEL: abs_double
define double @abs_double(double %a) {
- ; CHECK: abs.f64
+; CHECK-LABEL: abs_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [abs_double_param_0];
+; CHECK-NEXT: abs.f64 %fd2, %fd1;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd2;
+; CHECK-NEXT: ret;
%b = call double @llvm.fabs.f64(double %a)
ret double %b
}
-; ---- min ----
+; ---- minnum ----
-; CHECK-LABEL: min_half
-define half @min_half(half %a, half %b) {
- ; CHECK-NOF16: min.f32
- ; CHECK-F16: min.f16
+define half @minnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: minnum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minnum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minnum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT: min.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minnum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [minnum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [minnum_half_param_1];
+; CHECK-F16-NEXT: min.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minnum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT: min.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-SM80-NOF16-NEXT: ret;
%x = call half @llvm.minnum.f16(half %a, half %b)
ret half %x
}
-; CHECK-LABEL: min_float
-define float @min_float(float %a, float %b) {
- ; CHECK: min.f32
+define float @minnum_float(float %a, float %b) {
+; CHECK-LABEL: minnum_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [minnum_float_param_0];
+; CHECK-NEXT: ld.param.f32 %f2, [minnum_float_param_1];
+; CHECK-NEXT: min.f32 %f3, %f1, %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: ret;
%x = call float @llvm.minnum.f32(float %a, float %b)
ret float %x
}
-; CHECK-LABEL: min_imm1
-define float @min_imm1(float %a) {
- ; CHECK: min.f32
+define float @minnum_imm1(float %a) {
+; CHECK-LABEL: minnum_imm1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [minnum_imm1_param_0];
+; CHECK-NEXT: min.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%x = call float @llvm.minnum.f32(float %a, float 0.0)
ret float %x
}
-; CHECK-LABEL: min_imm2
-define float @min_imm2(float %a) {
- ; CHECK: min.f32
+define float @minnum_imm2(float %a) {
+; CHECK-LABEL: minnum_imm2(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [minnum_imm2_param_0];
+; CHECK-NEXT: min.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%x = call float @llvm.minnum.f32(float 0.0, float %a)
ret float %x
}
-; CHECK-LABEL: min_float_ftz
-define float @min_float_ftz(float %a, float %b) #1 {
- ; CHECK: min.ftz.f32
+define float @minnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: minnum_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [minnum_float_ftz_param_0];
+; CHECK-NEXT: ld.param.f32 %f2, [minnum_float_ftz_param_1];
+; CHECK-NEXT: min.ftz.f32 %f3, %f1, %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: ret;
%x = call float @llvm.minnum.f32(float %a, float %b)
ret float %x
}
-; CHECK-LABEL: min_double
-define double @min_double(double %a, double %b) {
- ; CHECK: min.f64
+define double @minnum_double(double %a, double %b) {
+; CHECK-LABEL: minnum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [minnum_double_param_0];
+; CHECK-NEXT: ld.param.f64 %fd2, [minnum_double_param_1];
+; CHECK-NEXT: min.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT: ret;
%x = call double @llvm.minnum.f64(double %a, double %b)
ret double %x
}
-; CHECK-LABEL: min_v2half
-define <2 x half> @min_v2half(<2 x half> %a, <2 x half> %b) {
- ; CHECK-NOF16: min.f32
- ; CHECK-NOF16: min.f32
- ; CHECK-F16: min.f16x2
+define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: minnum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minnum_v2half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [minnum_v2half_param_1];
+; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT: min.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT: min.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minnum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [minnum_v2half_param_1];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [minnum_v2half_param_0];
+; CHECK-F16-NEXT: min.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minnum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r1, [minnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r2, [minnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT: min.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT: min.f32 %f6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: ret;
%x = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %x
}
-; ---- max ----
+; ---- minimum ----
-; CHECK-LABEL: max_half
-define half @max_half(half %a, half %b) {
- ; CHECK-NOF16: max.f32
- ; CHECK-F16: max.f16
+define half @minimum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: minimum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<6>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<10>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs3, [minimum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs1, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs3, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1];
+; CHECK-F16-NEXT: min.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<6>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<10>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs3, [minimum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs1, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs3, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call half @llvm.minimum.f16(half %a, half %b)
+ ret half %x
+}
+
+define float @minimum_float(float %a, float %b) {
+; CHECK-NOF16-LABEL: minimum_float(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<5>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT: min.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT: mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimum_float(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_float_param_0];
+; CHECK-F16-NEXT: ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-F16-NEXT: min.NaN.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_float(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_param_1];
+; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %x
+}
+
+define float @minimum_imm1(float %a) {
+; CHECK-NOF16-LABEL: minimum_imm1(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<2>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<6>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT: min.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %f4, %f1, %f3, %p2;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f5, %f4, %f3, %p3;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f5;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimum_imm1(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-F16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_imm1(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.minimum.f32(float %a, float 0.0)
+ ret float %x
+}
+
+define float @minimum_imm2(float %a) {
+; CHECK-NOF16-LABEL: minimum_imm2(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<4>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<2>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<6>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT: min.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %f4, %f1, %f3, %p2;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f5, %f4, %f3, %p3;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f5;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimum_imm2(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-F16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_imm2(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.minimum.f32(float 0.0, float %a)
+ ret float %x
+}
+
+define float @minimum_float_ftz(float %a, float %b) #1 {
+; CHECK-NOF16-LABEL: minimum_float_ftz(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<5>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT: min.ftz.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT: mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimum_float_ftz(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0];
+; CHECK-F16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-F16-NEXT: min.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_float_ftz(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT: min.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %x
+}
+
+define double @minimum_double(double %a, double %b) {
+; CHECK-LABEL: minimum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .f64 %fd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [minimum_double_param_0];
+; CHECK-NEXT: mov.b64 %rd1, %fd1;
+; CHECK-NEXT: ld.param.f64 %fd2, [minimum_double_param_1];
+; CHECK-NEXT: setp.nan.f64 %p1, %fd1, %fd2;
+; CHECK-NEXT: min.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT: selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
+; CHECK-NEXT: setp.eq.s64 %p2, %rd1, -9223372036854775808;
+; CHECK-NEXT: selp.f64 %fd5, %fd1, %fd4, %p2;
+; CHECK-NEXT: mov.b64 %rd2, %fd2;
+; CHECK-NEXT: setp.eq.s64 %p3, %rd2, -9223372036854775808;
+; CHECK-NEXT: selp.f64 %fd6, %fd2, %fd5, %p3;
+; CHECK-NEXT: setp.eq.f64 %p4, %fd4, 0d0000000000000000;
+; CHECK-NEXT: selp.f64 %fd7, %fd6, %fd4, %p4;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd7;
+; CHECK-NEXT: ret;
+ %x = call double @llvm.minimum.f64(double %a, double %b)
+ ret double %x
+}
+
+define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: minimum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<11>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<19>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_v2half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [minimum_v2half_param_1];
+; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs4, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs2, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs6;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT: setp.lt.f32 %p6, %f5, %f4;
+; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4;
+; CHECK-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p8, %rs3, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p9, %rs1, -32768;
+; CHECK-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs13;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: minimum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [minimum_v2half_param_1];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [minimum_v2half_param_0];
+; CHECK-F16-NEXT: min.NaN.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: minimum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<11>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<19>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r1, [minimum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r2, [minimum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs4, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs2, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs6;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p8, %rs3, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p9, %rs1, -32768;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f6, %rs13;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %x
+}
+
+; ---- maxnum ----
+
+define half @maxnum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: maxnum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maxnum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maxnum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT: max.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maxnum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [maxnum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [maxnum_half_param_1];
+; CHECK-F16-NEXT: max.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maxnum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maxnum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maxnum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT: max.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-SM80-NOF16-NEXT: ret;
%x = call half @llvm.maxnum.f16(half %a, half %b)
ret half %x
}
-; CHECK-LABEL: max_imm1
-define float @max_imm1(float %a) {
- ; CHECK: max.f32
+define float @maxnum_imm1(float %a) {
+; CHECK-LABEL: maxnum_imm1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [maxnum_imm1_param_0];
+; CHECK-NEXT: max.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%x = call float @llvm.maxnum.f32(float %a, float 0.0)
ret float %x
}
-; CHECK-LABEL: max_imm2
-define float @max_imm2(float %a) {
- ; CHECK: max.f32
+define float @maxnum_imm2(float %a) {
+; CHECK-LABEL: maxnum_imm2(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [maxnum_imm2_param_0];
+; CHECK-NEXT: max.f32 %f2, %f1, 0f00000000;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-NEXT: ret;
%x = call float @llvm.maxnum.f32(float 0.0, float %a)
ret float %x
}
-; CHECK-LABEL: max_float
-define float @max_float(float %a, float %b) {
- ; CHECK: max.f32
+define float @maxnum_float(float %a, float %b) {
+; CHECK-LABEL: maxnum_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [maxnum_float_param_0];
+; CHECK-NEXT: ld.param.f32 %f2, [maxnum_float_param_1];
+; CHECK-NEXT: max.f32 %f3, %f1, %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: ret;
%x = call float @llvm.maxnum.f32(float %a, float %b)
ret float %x
}
-; CHECK-LABEL: max_float_ftz
-define float @max_float_ftz(float %a, float %b) #1 {
- ; CHECK: max.ftz.f32
+define float @maxnum_float_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: maxnum_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [maxnum_float_ftz_param_0];
+; CHECK-NEXT: ld.param.f32 %f2, [maxnum_float_ftz_param_1];
+; CHECK-NEXT: max.ftz.f32 %f3, %f1, %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-NEXT: ret;
%x = call float @llvm.maxnum.f32(float %a, float %b)
ret float %x
}
-; CHECK-LABEL: max_double
-define double @max_double(double %a, double %b) {
- ; CHECK: max.f64
+define double @maxnum_double(double %a, double %b) {
+; CHECK-LABEL: maxnum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [maxnum_double_param_0];
+; CHECK-NEXT: ld.param.f64 %fd2, [maxnum_double_param_1];
+; CHECK-NEXT: max.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd3;
+; CHECK-NEXT: ret;
%x = call double @llvm.maxnum.f64(double %a, double %b)
ret double %x
}
-; CHECK-LABEL: max_v2half
-define <2 x half> @max_v2half(<2 x half> %a, <2 x half> %b) {
- ; CHECK-NOF16: max.f32
- ; CHECK-NOF16: max.f32
- ; CHECK-F16: max.f16x2
+define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: maxnum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maxnum_v2half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [maxnum_v2half_param_1];
+; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT: max.f32 %f3, %f2, %f1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT: max.f32 %f6, %f5, %f4;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maxnum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [maxnum_v2half_param_1];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [maxnum_v2half_param_0];
+; CHECK-F16-NEXT: max.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maxnum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r1, [maxnum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r2, [maxnum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT: max.f32 %f3, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT: max.f32 %f6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: ret;
%x = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %x
}
+; ---- maximum ----
+
+define half @maximum_half(half %a, half %b) {
+; CHECK-NOF16-LABEL: maximum_half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<6>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<10>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [maximum_half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b16 %rs3, [maximum_half_param_1];
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs1, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs3, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximum_half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b16 %rs<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b16 %rs1, [maximum_half_param_0];
+; CHECK-F16-NEXT: ld.param.b16 %rs2, [maximum_half_param_1];
+; CHECK-F16-NEXT: max.NaN.f16 %rs3, %rs1, %rs2;
+; CHECK-F16-NEXT: st.param.b16 [func_retval0+0], %rs3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<6>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<10>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximum_half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs3, [maximum_half_param_1];
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, %rs1, %rs3, %p1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, 0x7E00, %rs4, %p2;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs1, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs1, %rs5, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs3, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs3, %rs6, %p4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs5;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs5, %p5;
+; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0+0], %rs9;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call half @llvm.maximum.f16(half %a, half %b)
+ ret half %x
+}
+
+define float @maximum_imm1(float %a) {
+; CHECK-NOF16-LABEL: maximum_imm1(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT: max.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f4, 0f00000000, %f3, %p2;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximum_imm1(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-F16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_imm1(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0];
+; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.maximum.f32(float %a, float 0.0)
+ ret float %x
+}
+
+define float @maximum_imm2(float %a) {
+; CHECK-NOF16-LABEL: maximum_imm2(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<5>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1;
+; CHECK-NOF16-NEXT: max.f32 %f2, %f1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f4, 0f00000000, %f3, %p2;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximum_imm2(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<3>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-F16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_imm2(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<3>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0];
+; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f2;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.maximum.f32(float 0.0, float %a)
+ ret float %x
+}
+
+define float @maximum_float(float %a, float %b) {
+; CHECK-NOF16-LABEL: maximum_float(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<5>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT: max.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, 0;
+; CHECK-NOF16-NEXT: selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT: mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, 0;
+; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximum_float(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_float_param_0];
+; CHECK-F16-NEXT: ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-F16-NEXT: max.NaN.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_float(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_param_1];
+; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %x
+}
+
+define float @maximum_float_ftz(float %a, float %b) #1 {
+; CHECK-NOF16-LABEL: maximum_float_ftz(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<5>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<8>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0];
+; CHECK-NOF16-NEXT: mov.b32 %r1, %f1;
+; CHECK-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %f1, %f2;
+; CHECK-NOF16-NEXT: max.ftz.f32 %f3, %f1, %f2;
+; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, 0;
+; CHECK-NOF16-NEXT: selp.f32 %f5, %f1, %f4, %p2;
+; CHECK-NOF16-NEXT: mov.b32 %r2, %f2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, 0;
+; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %f4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4;
+; CHECK-NOF16-NEXT: st.param.f32 [func_retval0+0], %f7;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximum_float_ftz(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .f32 %f<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0];
+; CHECK-F16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-F16-NEXT: max.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-F16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_float_ftz(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<4>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1];
+; CHECK-SM80-NOF16-NEXT: max.NaN.ftz.f32 %f3, %f1, %f2;
+; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0+0], %f3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %x
+}
+
+define double @maximum_double(double %a, double %b) {
+; CHECK-LABEL: maximum_double(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .f64 %fd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [maximum_double_param_0];
+; CHECK-NEXT: mov.b64 %rd1, %fd1;
+; CHECK-NEXT: ld.param.f64 %fd2, [maximum_double_param_1];
+; CHECK-NEXT: setp.nan.f64 %p1, %fd1, %fd2;
+; CHECK-NEXT: max.f64 %fd3, %fd1, %fd2;
+; CHECK-NEXT: selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1;
+; CHECK-NEXT: setp.eq.s64 %p2, %rd1, 0;
+; CHECK-NEXT: selp.f64 %fd5, %fd1, %fd4, %p2;
+; CHECK-NEXT: mov.b64 %rd2, %fd2;
+; CHECK-NEXT: setp.eq.s64 %p3, %rd2, 0;
+; CHECK-NEXT: selp.f64 %fd6, %fd2, %fd5, %p3;
+; CHECK-NEXT: setp.eq.f64 %p4, %fd4, 0d0000000000000000;
+; CHECK-NEXT: selp.f64 %fd7, %fd6, %fd4, %p4;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd7;
+; CHECK-NEXT: ret;
+ %x = call double @llvm.maximum.f64(double %a, double %b)
+ ret double %x
+}
+
+define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) {
+; CHECK-NOF16-LABEL: maximum_v2half(
+; CHECK-NOF16: {
+; CHECK-NOF16-NEXT: .reg .pred %p<11>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<19>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-NOF16-EMPTY:
+; CHECK-NOF16-NEXT: // %bb.0:
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maximum_v2half_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [maximum_v2half_param_1];
+; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p3, %rs4, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p4, %rs2, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs6;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-NOF16-NEXT: setp.gt.f32 %p6, %f5, %f4;
+; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4;
+; CHECK-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p8, %rs3, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-NOF16-NEXT: setp.eq.s16 %p9, %rs1, 0;
+; CHECK-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs13;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NOF16-NEXT: ret;
+;
+; CHECK-F16-LABEL: maximum_v2half(
+; CHECK-F16: {
+; CHECK-F16-NEXT: .reg .b32 %r<4>;
+; CHECK-F16-EMPTY:
+; CHECK-F16-NEXT: // %bb.0:
+; CHECK-F16-NEXT: ld.param.b32 %r1, [maximum_v2half_param_1];
+; CHECK-F16-NEXT: ld.param.b32 %r2, [maximum_v2half_param_0];
+; CHECK-F16-NEXT: max.NaN.f16x2 %r3, %r2, %r1;
+; CHECK-F16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-F16-NEXT: ret;
+;
+; CHECK-SM80-NOF16-LABEL: maximum_v2half(
+; CHECK-SM80-NOF16: {
+; CHECK-SM80-NOF16-NEXT: .reg .pred %p<11>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<19>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>;
+; CHECK-SM80-NOF16-NEXT: .reg .f32 %f<7>;
+; CHECK-SM80-NOF16-EMPTY:
+; CHECK-SM80-NOF16-NEXT: // %bb.0:
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r1, [maximum_v2half_param_0];
+; CHECK-SM80-NOF16-NEXT: ld.param.b32 %r2, [maximum_v2half_param_1];
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f2, %rs4;
+; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p1, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %f2, %f1;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p3, %rs4, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs6, %p3;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p4, %rs2, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs2, %rs8, %p4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f3, %rs6;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f4, %rs1;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f5, %rs3;
+; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p6, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs3, %rs1, %p6;
+; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %f5, %f4;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, 0x7E00, %rs12, %p7;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p8, %rs3, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs3, %rs13, %p8;
+; CHECK-SM80-NOF16-NEXT: setp.eq.s16 %p9, %rs1, 0;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs17, %rs1, %rs15, %p9;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %f6, %rs13;
+; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
+; CHECK-SM80-NOF16-NEXT: selp.b16 %rs18, %rs17, %rs13, %p10;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r3, {%rs18, %rs11};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-SM80-NOF16-NEXT: ret;
+ %x = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %x
+}
+
; ---- fma ----
-; CHECK-LABEL: @fma_float
define float @fma_float(float %a, float %b, float %c) {
- ; CHECK: fma.rn.f32
+; CHECK-LABEL: fma_float(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [fma_float_param_0];
+; CHECK-NEXT: ld.param.f32 %f2, [fma_float_param_1];
+; CHECK-NEXT: ld.param.f32 %f3, [fma_float_param_2];
+; CHECK-NEXT: fma.rn.f32 %f4, %f1, %f2, %f3;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NEXT: ret;
%x = call float @llvm.fma.f32(float %a, float %b, float %c)
ret float %x
}
-; CHECK-LABEL: @fma_float_ftz
define float @fma_float_ftz(float %a, float %b, float %c) #1 {
- ; CHECK: fma.rn.ftz.f32
+; CHECK-LABEL: fma_float_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f32 %f1, [fma_float_ftz_param_0];
+; CHECK-NEXT: ld.param.f32 %f2, [fma_float_ftz_param_1];
+; CHECK-NEXT: ld.param.f32 %f3, [fma_float_ftz_param_2];
+; CHECK-NEXT: fma.rn.ftz.f32 %f4, %f1, %f2, %f3;
+; CHECK-NEXT: st.param.f32 [func_retval0+0], %f4;
+; CHECK-NEXT: ret;
%x = call float @llvm.fma.f32(float %a, float %b, float %c)
ret float %x
}
-; CHECK-LABEL: @fma_double
define double @fma_double(double %a, double %b, double %c) {
- ; CHECK: fma.rn.f64
+; CHECK-LABEL: fma_double(
+; CHECK: {
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [fma_double_param_0];
+; CHECK-NEXT: ld.param.f64 %fd2, [fma_double_param_1];
+; CHECK-NEXT: ld.param.f64 %fd3, [fma_double_param_2];
+; CHECK-NEXT: fma.rn.f64 %fd4, %fd1, %fd2, %fd3;
+; CHECK-NEXT: st.param.f64 [func_retval0+0], %fd4;
+; CHECK-NEXT: ret;
%x = call double @llvm.fma.f64(double %a, double %b, double %c)
ret double %x
}
diff --git a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
index aae2326..afc7a39 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll
@@ -750,25 +750,21 @@ entry:
define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) {
; CHECK-64-LABEL: testDoubleImm1:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-64-NEXT: xxpermdi 34, 1, 34, 1
; CHECK-64-NEXT: blr
;
; CHECK-32-LABEL: testDoubleImm1:
; CHECK-32: # %bb.0: # %entry
-; CHECK-32-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-32-NEXT: xxpermdi 34, 1, 34, 1
; CHECK-32-NEXT: blr
;
; CHECK-64-P10-LABEL: testDoubleImm1:
; CHECK-64-P10: # %bb.0: # %entry
-; CHECK-64-P10-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-64-P10-NEXT: xxpermdi 34, 1, 34, 1
; CHECK-64-P10-NEXT: blr
;
; CHECK-32-P10-LABEL: testDoubleImm1:
; CHECK-32-P10: # %bb.0: # %entry
-; CHECK-32-P10-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-32-P10-NEXT: xxpermdi 34, 1, 34, 1
; CHECK-32-P10-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
index 19e298a..2f543da 100644
--- a/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
+++ b/llvm/test/CodeGen/PowerPC/aix32-p8-scalar_vector_conversions.ll
@@ -1099,7 +1099,6 @@ define double @getd1(<2 x double> %vd) {
; CHECK-LABEL: getd1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxswapd 1, 34
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
entry:
%vecext = extractelement <2 x double> %vd, i32 1
@@ -1115,7 +1114,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
; CHECK-NEXT: lvsl 3, 0, 3
; CHECK-NEXT: vperm 2, 2, 2, 3
; CHECK-NEXT: xxlor 1, 34, 34
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
entry:
%vecext = extractelement <2 x double> %vd, i32 %i
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
index f729018d..91431ed 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -1319,11 +1319,7 @@ entry:
define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
; P9BE-LABEL: fromRegsConvftoi:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9BE-NEXT: xxmrghd vs0, vs2, vs4
-; P9BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xvcvdpsxws v2, vs0
; P9BE-NEXT: xxmrghd vs0, vs1, vs3
; P9BE-NEXT: xvcvdpsxws v3, vs0
@@ -1332,11 +1328,7 @@ define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
;
; P9LE-LABEL: fromRegsConvftoi:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs3, vs1
-; P9LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9LE-NEXT: xvcvdpsxws v2, vs0
; P9LE-NEXT: xxmrghd vs0, vs4, vs2
; P9LE-NEXT: xvcvdpsxws v3, vs0
@@ -1345,10 +1337,6 @@ define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
;
; P8BE-LABEL: fromRegsConvftoi:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs2, vs4
; P8BE-NEXT: xxmrghd vs1, vs1, vs3
; P8BE-NEXT: xvcvdpsxws v2, vs0
@@ -1358,10 +1346,6 @@ define <4 x i32> @fromRegsConvftoi(float %a, float %b, float %c, float %d) {
;
; P8LE-LABEL: fromRegsConvftoi:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs3, vs1
; P8LE-NEXT: xxmrghd vs1, vs4, vs2
; P8LE-NEXT: xvcvdpsxws v2, vs0
@@ -1773,11 +1757,7 @@ entry:
define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
; P9BE-LABEL: fromRegsConvdtoi:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9BE-NEXT: xxmrghd vs0, vs2, vs4
-; P9BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xvcvdpsxws v2, vs0
; P9BE-NEXT: xxmrghd vs0, vs1, vs3
; P9BE-NEXT: xvcvdpsxws v3, vs0
@@ -1786,11 +1766,7 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
;
; P9LE-LABEL: fromRegsConvdtoi:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs3, vs1
-; P9LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9LE-NEXT: xvcvdpsxws v2, vs0
; P9LE-NEXT: xxmrghd vs0, vs4, vs2
; P9LE-NEXT: xvcvdpsxws v3, vs0
@@ -1799,10 +1775,6 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
;
; P8BE-LABEL: fromRegsConvdtoi:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs2, vs4
; P8BE-NEXT: xxmrghd vs1, vs1, vs3
; P8BE-NEXT: xvcvdpsxws v2, vs0
@@ -1812,10 +1784,6 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) {
;
; P8LE-LABEL: fromRegsConvdtoi:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs3, vs1
; P8LE-NEXT: xxmrghd vs1, vs4, vs2
; P8LE-NEXT: xvcvdpsxws v2, vs0
@@ -2839,11 +2807,7 @@ entry:
define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
; P9BE-LABEL: fromRegsConvftoui:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9BE-NEXT: xxmrghd vs0, vs2, vs4
-; P9BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xvcvdpuxws v2, vs0
; P9BE-NEXT: xxmrghd vs0, vs1, vs3
; P9BE-NEXT: xvcvdpuxws v3, vs0
@@ -2852,11 +2816,7 @@ define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
;
; P9LE-LABEL: fromRegsConvftoui:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs3, vs1
-; P9LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9LE-NEXT: xvcvdpuxws v2, vs0
; P9LE-NEXT: xxmrghd vs0, vs4, vs2
; P9LE-NEXT: xvcvdpuxws v3, vs0
@@ -2865,10 +2825,6 @@ define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
;
; P8BE-LABEL: fromRegsConvftoui:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs2, vs4
; P8BE-NEXT: xxmrghd vs1, vs1, vs3
; P8BE-NEXT: xvcvdpuxws v2, vs0
@@ -2878,10 +2834,6 @@ define <4 x i32> @fromRegsConvftoui(float %a, float %b, float %c, float %d) {
;
; P8LE-LABEL: fromRegsConvftoui:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs3, vs1
; P8LE-NEXT: xxmrghd vs1, vs4, vs2
; P8LE-NEXT: xvcvdpuxws v2, vs0
@@ -3294,11 +3246,7 @@ entry:
define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) {
; P9BE-LABEL: fromRegsConvdtoui:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9BE-NEXT: xxmrghd vs0, vs2, vs4
-; P9BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xvcvdpuxws v2, vs0
; P9BE-NEXT: xxmrghd vs0, vs1, vs3
; P9BE-NEXT: xvcvdpuxws v3, vs0
@@ -3307,11 +3255,7 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d)
;
; P9LE-LABEL: fromRegsConvdtoui:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs3, vs1
-; P9LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
; P9LE-NEXT: xvcvdpuxws v2, vs0
; P9LE-NEXT: xxmrghd vs0, vs4, vs2
; P9LE-NEXT: xvcvdpuxws v3, vs0
@@ -3320,10 +3264,6 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d)
;
; P8BE-LABEL: fromRegsConvdtoui:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8BE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs2, vs4
; P8BE-NEXT: xxmrghd vs1, vs1, vs3
; P8BE-NEXT: xvcvdpuxws v2, vs0
@@ -3333,10 +3273,6 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d)
;
; P8LE-LABEL: fromRegsConvdtoui:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; P8LE-NEXT: # kill: def $f3 killed $f3 def $vsl3
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs3, vs1
; P8LE-NEXT: xxmrghd vs1, vs4, vs2
; P8LE-NEXT: xvcvdpuxws v2, vs0
@@ -4269,32 +4205,24 @@ entry:
define <2 x i64> @fromRegsConvftoll(float %a, float %b) {
; P9BE-LABEL: fromRegsConvftoll:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xxmrghd vs0, vs1, vs2
; P9BE-NEXT: xvcvdpsxds v2, vs0
; P9BE-NEXT: blr
;
; P9LE-LABEL: fromRegsConvftoll:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs2, vs1
; P9LE-NEXT: xvcvdpsxds v2, vs0
; P9LE-NEXT: blr
;
; P8BE-LABEL: fromRegsConvftoll:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs1, vs2
; P8BE-NEXT: xvcvdpsxds v2, vs0
; P8BE-NEXT: blr
;
; P8LE-LABEL: fromRegsConvftoll:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs2, vs1
; P8LE-NEXT: xvcvdpsxds v2, vs0
; P8LE-NEXT: blr
@@ -4630,32 +4558,24 @@ entry:
define <2 x i64> @fromRegsConvdtoll(double %a, double %b) {
; P9BE-LABEL: fromRegsConvdtoll:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xxmrghd vs0, vs1, vs2
; P9BE-NEXT: xvcvdpsxds v2, vs0
; P9BE-NEXT: blr
;
; P9LE-LABEL: fromRegsConvdtoll:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs2, vs1
; P9LE-NEXT: xvcvdpsxds v2, vs0
; P9LE-NEXT: blr
;
; P8BE-LABEL: fromRegsConvdtoll:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs1, vs2
; P8BE-NEXT: xvcvdpsxds v2, vs0
; P8BE-NEXT: blr
;
; P8LE-LABEL: fromRegsConvdtoll:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs2, vs1
; P8LE-NEXT: xvcvdpsxds v2, vs0
; P8LE-NEXT: blr
@@ -5451,32 +5371,24 @@ entry:
define <2 x i64> @fromRegsConvftoull(float %a, float %b) {
; P9BE-LABEL: fromRegsConvftoull:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xxmrghd vs0, vs1, vs2
; P9BE-NEXT: xvcvdpuxds v2, vs0
; P9BE-NEXT: blr
;
; P9LE-LABEL: fromRegsConvftoull:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs2, vs1
; P9LE-NEXT: xvcvdpuxds v2, vs0
; P9LE-NEXT: blr
;
; P8BE-LABEL: fromRegsConvftoull:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs1, vs2
; P8BE-NEXT: xvcvdpuxds v2, vs0
; P8BE-NEXT: blr
;
; P8LE-LABEL: fromRegsConvftoull:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs2, vs1
; P8LE-NEXT: xvcvdpuxds v2, vs0
; P8LE-NEXT: blr
@@ -5812,32 +5724,24 @@ entry:
define <2 x i64> @fromRegsConvdtoull(double %a, double %b) {
; P9BE-LABEL: fromRegsConvdtoull:
; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9BE-NEXT: xxmrghd vs0, vs1, vs2
; P9BE-NEXT: xvcvdpuxds v2, vs0
; P9BE-NEXT: blr
;
; P9LE-LABEL: fromRegsConvdtoull:
; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9LE-NEXT: xxmrghd vs0, vs2, vs1
; P9LE-NEXT: xvcvdpuxds v2, vs0
; P9LE-NEXT: blr
;
; P8BE-LABEL: fromRegsConvdtoull:
; P8BE: # %bb.0: # %entry
-; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8BE-NEXT: xxmrghd vs0, vs1, vs2
; P8BE-NEXT: xvcvdpuxds v2, vs0
; P8BE-NEXT: blr
;
; P8LE-LABEL: fromRegsConvdtoull:
; P8LE: # %bb.0: # %entry
-; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8LE-NEXT: xxmrghd vs0, vs2, vs1
; P8LE-NEXT: xvcvdpuxds v2, vs0
; P8LE-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll
index 7aa8b0e..798c23c 100644
--- a/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll
+++ b/llvm/test/CodeGen/PowerPC/builtins-ppc-xlcompat-pwr9-64bit.ll
@@ -22,7 +22,6 @@ define dso_local double @insert_exp(double %d, i64 %ull) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mffprd 3, 1
; CHECK-NEXT: xsiexpdp 1, 3, 4
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
entry:
%0 = tail call double @llvm.ppc.insert.exp(double %d, i64 %ull)
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index f2bd4c7..c26f98c 100644
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -565,7 +565,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
; CHECK-P8-NEXT: bl dummy
; CHECK-P8-NEXT: nop
; CHECK-P8-NEXT: xxlxor f0, f0, f0
-; CHECK-P8-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-P8-NEXT: xxmrghd vs0, vs1, vs0
; CHECK-P8-NEXT: xxswapd vs0, vs0
; CHECK-P8-NEXT: stxvd2x vs0, 0, r30
@@ -580,7 +579,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
; CHECK-P9-NEXT: bl dummy
; CHECK-P9-NEXT: nop
; CHECK-P9-NEXT: xxlxor f0, f0, f0
-; CHECK-P9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-P9-NEXT: xxmrghd vs0, vs1, vs0
; CHECK-P9-NEXT: stxv vs0, 0(r30)
;
@@ -594,7 +592,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
; CHECK-P9-BE-NEXT: bl dummy
; CHECK-P9-BE-NEXT: nop
; CHECK-P9-BE-NEXT: xxlxor f0, f0, f0
-; CHECK-P9-BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-P9-BE-NEXT: xxmrghd vs0, vs0, vs1
; CHECK-P9-BE-NEXT: stxv vs0, 0(r30)
;
@@ -621,7 +618,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
; CHECK-P7-NEXT: bl dummy
; CHECK-P7-NEXT: nop
; CHECK-P7-NEXT: xxlxor f0, f0, f0
-; CHECK-P7-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-P7-NEXT: xxmrghd vs0, vs1, vs0
; CHECK-P7-NEXT: xxswapd vs0, vs0
; CHECK-P7-NEXT: stxvd2x vs0, 0, r30
@@ -636,7 +632,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
; P8-AIX-64-NEXT: bl .dummy[PR]
; P8-AIX-64-NEXT: nop
; P8-AIX-64-NEXT: xxlxor f0, f0, f0
-; P8-AIX-64-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8-AIX-64-NEXT: xxmrghd vs0, vs0, vs1
; P8-AIX-64-NEXT: stxvd2x vs0, 0, r31
;
@@ -650,7 +645,6 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea
; P8-AIX-32-NEXT: bl .dummy[PR]
; P8-AIX-32-NEXT: nop
; P8-AIX-32-NEXT: xxlxor f0, f0, f0
-; P8-AIX-32-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8-AIX-32-NEXT: xxmrghd vs0, vs0, vs1
; P8-AIX-32-NEXT: stxvd2x vs0, 0, r31
test_entry:
diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
index a72abf7..04af094 100644
--- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll
+++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
@@ -6,7 +6,6 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) {
; CHECK-LABEL: fneg_fdiv_splat:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxspltd 0, 1, 0
; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
; CHECK-NEXT: xvredp 1, 0
diff --git a/llvm/test/CodeGen/PowerPC/constant-pool.ll b/llvm/test/CodeGen/PowerPC/constant-pool.ll
index a9feb93..2ded721 100644
--- a/llvm/test/CodeGen/PowerPC/constant-pool.ll
+++ b/llvm/test/CodeGen/PowerPC/constant-pool.ll
@@ -11,7 +11,6 @@
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxsplti32dx vs1, 0, 940572664
; CHECK-NEXT: xxsplti32dx vs1, 1, 1073741824
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: FloatConstantPool:
@@ -28,7 +27,6 @@ entry:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxsplti32dx vs1, 0, 1048574
; CHECK-NEXT: xxsplti32dx vs1, 1, 780229072
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: DoubleConstantPool:
@@ -47,8 +45,6 @@ entry:
; CHECK-NEXT: xxsplti32dx vs2, 0, -2146625897
; CHECK-NEXT: xxsplti32dx vs1, 1, -609716532
; CHECK-NEXT: xxsplti32dx vs2, 1, 1339675259
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: LongDoubleConstantPool:
@@ -224,13 +220,11 @@ define double @two_constants_two_bb(i32 %m, double %a) {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: xxsplti32dx vs1, 0, 1074935889
; CHECK-NEXT: xxsplti32dx vs1, 1, -343597384
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB12_2: # %if.end
; CHECK-NEXT: xxsplti32dx vs0, 0, 1076085391
; CHECK-NEXT: xxsplti32dx vs0, 1, 1546188227
; CHECK-NEXT: xsadddp f1, f1, f0
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; CHECK-P9-LABEL: two_constants_two_bb:
@@ -369,12 +363,10 @@ define ppc_fp128 @three_constants_ppcf128(ppc_fp128 %a, ppc_fp128 %c) {
; CHECK-NEXT: stxv vs63, 32(r1) # 16-byte Folded Spill
; CHECK-NEXT: xxsplti32dx vs63, 0, 1074935889
; CHECK-NEXT: xxsplti32dx vs3, 1, -343597384
-; CHECK-NEXT: # kill: def $f3 killed $f3 killed $vsl3
; CHECK-NEXT: bl __gcc_qadd@notoc
; CHECK-NEXT: xxsplti32dx vs3, 0, 1074935889
; CHECK-NEXT: xxlxor f4, f4, f4
; CHECK-NEXT: xxsplti32dx vs3, 1, -1719329096
-; CHECK-NEXT: # kill: def $f3 killed $f3 killed $vsl3
; CHECK-NEXT: bl __gcc_qadd@notoc
; CHECK-NEXT: xxsplti32dx vs63, 1, 8724152
; CHECK-NEXT: xxlxor f4, f4, f4
diff --git a/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll b/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll
index fc0bfef..9d537d8 100644
--- a/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll
+++ b/llvm/test/CodeGen/PowerPC/elf64-byval-cc.ll
@@ -403,11 +403,10 @@ define void @call_test_byval_mem32_2() #0 {
; CHECK-NEXT: std 0, 48(1)
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset lr, 16
-; CHECK-NEXT: addis 3, 2, .LC5@toc@ha
; CHECK-NEXT: vspltisw 2, 1
+; CHECK-NEXT: addis 3, 2, .LC5@toc@ha
; CHECK-NEXT: ld 3, .LC5@toc@l(3)
; CHECK-NEXT: xvcvsxwdp 1, 34
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: ld 7, 24(3)
; CHECK-NEXT: ld 6, 16(3)
; CHECK-NEXT: ld 5, 8(3)
@@ -453,9 +452,7 @@ define void @call_test_byval_mem32_3() #0 {
; CHECK-NEXT: li 7, 2
; CHECK-NEXT: ld 3, .LC5@toc@l(3)
; CHECK-NEXT: xvcvsxwdp 1, 34
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: xvcvsxwdp 2, 35
-; CHECK-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; CHECK-NEXT: lxvd2x 0, 3, 4
; CHECK-NEXT: li 4, 88
; CHECK-NEXT: stxvd2x 0, 1, 4
diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll
index b39c932..3d45e9a 100644
--- a/llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -202,7 +202,6 @@ define dso_local double @getNegatedExpression_crash(double %x, double %y) {
; CHECK-FAST-NEXT: xvcvsxwdp 4, 34
; CHECK-FAST-NEXT: lfs 3, .LCPI5_0@toc@l(3)
; CHECK-FAST-NEXT: xssubdp 0, 1, 4
-; CHECK-FAST-NEXT: # kill: def $f4 killed $f4 killed $vsl4
; CHECK-FAST-NEXT: xsmaddadp 4, 1, 3
; CHECK-FAST-NEXT: xsmaddadp 0, 4, 2
; CHECK-FAST-NEXT: fmr 1, 0
@@ -226,7 +225,6 @@ define dso_local double @getNegatedExpression_crash(double %x, double %y) {
; CHECK-NEXT: xvcvsxwdp 4, 34
; CHECK-NEXT: lfs 3, .LCPI5_0@toc@l(3)
; CHECK-NEXT: xssubdp 0, 1, 4
-; CHECK-NEXT: # kill: def $f4 killed $f4 killed $vsl4
; CHECK-NEXT: xsmaddadp 4, 1, 3
; CHECK-NEXT: xsmaddadp 0, 4, 2
; CHECK-NEXT: fmr 1, 0
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll
index 4c8729b9..eac4fb6 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll
@@ -229,7 +229,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
; P8-NEXT: xscvspdpn f1, vs0
; P8-NEXT: bl nearbyintf
; P8-NEXT: nop
-; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8-NEXT: xxmrghd vs0, vs1, v30
; P8-NEXT: xscvspdpn f1, v31
; P8-NEXT: xvcvdpsp v29, vs0
@@ -240,7 +239,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
; P8-NEXT: xscvspdpn f1, vs0
; P8-NEXT: bl nearbyintf
; P8-NEXT: nop
-; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8-NEXT: xxmrghd vs0, v30, vs1
; P8-NEXT: li r3, 160
; P8-NEXT: xvcvdpsp v2, vs0
@@ -278,7 +276,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
; P9-NEXT: xscvspdpn f1, vs0
; P9-NEXT: bl nearbyintf
; P9-NEXT: nop
-; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9-NEXT: xxmrghd vs0, vs1, v30
; P9-NEXT: xscvspdpn f1, v31
; P9-NEXT: xvcvdpsp v29, vs0
@@ -289,7 +286,6 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp
; P9-NEXT: xscvspdpn f1, vs0
; P9-NEXT: bl nearbyintf
; P9-NEXT: nop
-; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9-NEXT: xxmrghd vs0, v30, vs1
; P9-NEXT: lxv v31, 64(r1) # 16-byte Folded Reload
; P9-NEXT: lxv v30, 48(r1) # 16-byte Folded Reload
@@ -327,11 +323,9 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %vf1, <2 x double> %vf2) stric
; P8-NEXT: nop
; P8-NEXT: xxlor v30, f1, f1
; P8-NEXT: xxswapd vs1, v31
-; P8-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; P8-NEXT: bl nearbyint
; P8-NEXT: nop
; P8-NEXT: li r3, 144
-; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8-NEXT: xxmrghd v2, v30, vs1
; P8-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload
; P8-NEXT: li r3, 128
@@ -358,10 +352,8 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %vf1, <2 x double> %vf2) stric
; P9-NEXT: nop
; P9-NEXT: xscpsgndp v30, f1, f1
; P9-NEXT: xxswapd vs1, v31
-; P9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; P9-NEXT: bl nearbyint
; P9-NEXT: nop
-; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P9-NEXT: xxmrghd v2, v30, vs1
; P9-NEXT: lxv v31, 48(r1) # 16-byte Folded Reload
; P9-NEXT: lxv v30, 32(r1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/frem.ll b/llvm/test/CodeGen/PowerPC/frem.ll
index 8cb68e6..19b4b1c 100644
--- a/llvm/test/CodeGen/PowerPC/frem.ll
+++ b/llvm/test/CodeGen/PowerPC/frem.ll
@@ -70,7 +70,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: xscvspdpn 2, 0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd 0, 1, 61
; CHECK-NEXT: xscvspdpn 1, 62
; CHECK-NEXT: xscvspdpn 2, 63
@@ -84,7 +83,6 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: xscvspdpn 2, 0
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd 0, 61, 1
; CHECK-NEXT: lxv 63, 80(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 62, 64(1) # 16-byte Folded Reload
@@ -124,11 +122,8 @@ define <2 x double> @frem2x64(<2 x double> %a, <2 x double> %b) {
; CHECK-NEXT: xscpsgndp 61, 1, 1
; CHECK-NEXT: xxswapd 1, 62
; CHECK-NEXT: xxswapd 2, 63
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; CHECK-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; CHECK-NEXT: bl fmod
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd 34, 61, 1
; CHECK-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
; CHECK-NEXT: lxv 62, 48(1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
index 13f70f4..4256933 100644
--- a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
+++ b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
@@ -666,7 +666,6 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 {
; P8-NEXT: bl __gnu_h2f_ieee
; P8-NEXT: nop
; P8-NEXT: li r3, 80
-; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8-NEXT: xxmrghd vs0, vs61, vs1
; P8-NEXT: xxmrghd vs1, vs63, vs62
; P8-NEXT: ld r30, 96(r1) # 8-byte Folded Reload
@@ -776,7 +775,6 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 {
; P8-NEXT: nop
; P8-NEXT: li r3, 80
; P8-NEXT: xxmrghd vs35, vs63, vs62
-; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1
; P8-NEXT: xxmrghd vs34, vs61, vs1
; P8-NEXT: ld r30, 96(r1) # 8-byte Folded Reload
; P8-NEXT: lxvd2x vs63, r1, r3 # 16-byte Folded Reload
@@ -1005,11 +1003,10 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
; P8-NEXT: stdu r1, -128(r1)
; P8-NEXT: li r3, 48
; P8-NEXT: std r0, 144(r1)
-; P8-NEXT: std r27, 88(r1) # 8-byte Folded Spill
; P8-NEXT: xxswapd vs1, vs34
+; P8-NEXT: std r27, 88(r1) # 8-byte Folded Spill
; P8-NEXT: std r28, 96(r1) # 8-byte Folded Spill
; P8-NEXT: std r29, 104(r1) # 8-byte Folded Spill
-; P8-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; P8-NEXT: std r30, 112(r1) # 8-byte Folded Spill
; P8-NEXT: mr r30, r7
; P8-NEXT: stxvd2x vs62, r1, r3 # 16-byte Folded Spill
@@ -1019,9 +1016,8 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
; P8-NEXT: vmr v31, v3
; P8-NEXT: bl __truncdfhf2
; P8-NEXT: nop
-; P8-NEXT: mr r29, r3
; P8-NEXT: xxswapd vs1, vs63
-; P8-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; P8-NEXT: mr r29, r3
; P8-NEXT: bl __truncdfhf2
; P8-NEXT: nop
; P8-NEXT: xxlor f1, vs62, vs62
@@ -1238,7 +1234,6 @@ define half @PR40273(half) #0 {
; P8-NEXT: vspltisw v2, 1
; P8-NEXT: xvcvsxwdp vs1, vs34
; P8-NEXT: .LBB20_2:
-; P8-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; P8-NEXT: addi r1, r1, 32
; P8-NEXT: ld r0, 16(r1)
; P8-NEXT: mtlr r0
@@ -1253,12 +1248,10 @@ define half @PR40273(half) #0 {
; CHECK-NEXT: mtfprwz f0, r3
; CHECK-NEXT: xscvhpdp f0, f0
; CHECK-NEXT: fcmpu cr0, f0, f1
-; CHECK-NEXT: beq cr0, .LBB20_2
+; CHECK-NEXT: beqlr cr0
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: vspltisw v2, 1
; CHECK-NEXT: xvcvsxwdp vs1, vs34
-; CHECK-NEXT: .LBB20_2:
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; SOFT-LABEL: PR40273:
diff --git a/llvm/test/CodeGen/PowerPC/ldexp.ll b/llvm/test/CodeGen/PowerPC/ldexp.ll
index ed8089b..151df60 100644
--- a/llvm/test/CodeGen/PowerPC/ldexp.ll
+++ b/llvm/test/CodeGen/PowerPC/ldexp.ll
@@ -13,7 +13,6 @@ define float @ldexp_f32(i8 zeroext %x) {
; CHECK-NEXT: vspltisw v2, 1
; CHECK-NEXT: mr r4, r3
; CHECK-NEXT: xvcvsxwdp vs1, v2
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl ldexpf
; CHECK-NEXT: nop
; CHECK-NEXT: addi r1, r1, 32
@@ -36,7 +35,6 @@ define double @ldexp_f64(i8 zeroext %x) {
; CHECK-NEXT: vspltisw v2, 1
; CHECK-NEXT: mr r4, r3
; CHECK-NEXT: xvcvsxwdp vs1, v2
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl ldexp
; CHECK-NEXT: nop
; CHECK-NEXT: addi r1, r1, 32
@@ -120,7 +118,6 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) {
; CHECK-NEXT: vextuwrx r4, r3, v31
; CHECK-NEXT: bl ldexpf
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd vs0, v29, vs1
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: vextuwrx r4, r3, v31
@@ -135,7 +132,6 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) {
; CHECK-NEXT: xscvspdpn f1, vs0
; CHECK-NEXT: bl ldexpf
; CHECK-NEXT: nop
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd vs0, vs1, v29
; CHECK-NEXT: lxv v31, 80(r1) # 16-byte Folded Reload
; CHECK-NEXT: lxv v30, 64(r1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
index 7373a32..842cb92 100644
--- a/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
@@ -124,21 +124,18 @@ define dso_local double @testDoubleNonRepresentableScalar() local_unnamed_addr {
; CHECK-LE: # %bb.0: # %entry
; CHECK-LE-NEXT: xxsplti32dx vs1, 0, 1081435463
; CHECK-LE-NEXT: xxsplti32dx vs1, 1, -1374389535
-; CHECK-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-LE-NEXT: blr
;
; CHECK-NOPCREL-BE-LABEL: testDoubleNonRepresentableScalar:
; CHECK-NOPCREL-BE: # %bb.0: # %entry
; CHECK-NOPCREL-BE-NEXT: xxsplti32dx vs1, 0, 1081435463
; CHECK-NOPCREL-BE-NEXT: xxsplti32dx vs1, 1, -1374389535
-; CHECK-NOPCREL-BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NOPCREL-BE-NEXT: blr
;
; CHECK-NOPCREL-LE-LABEL: testDoubleNonRepresentableScalar:
; CHECK-NOPCREL-LE: # %bb.0: # %entry
; CHECK-NOPCREL-LE-NEXT: xxsplti32dx vs1, 0, 1081435463
; CHECK-NOPCREL-LE-NEXT: xxsplti32dx vs1, 1, -1374389535
-; CHECK-NOPCREL-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NOPCREL-LE-NEXT: blr
;
; CHECK-NOPREFIX-LABEL: testDoubleNonRepresentableScalar:
@@ -151,7 +148,6 @@ define dso_local double @testDoubleNonRepresentableScalar() local_unnamed_addr {
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xxsplti32dx vs1, 0, 1081435463
; CHECK-BE-NEXT: xxsplti32dx vs1, 1, -1374389535
-; CHECK-BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-BE-NEXT: blr
entry:
ret double 3.423300e+02
@@ -162,21 +158,18 @@ define dso_local float @testFloatDenormScalar() local_unnamed_addr {
; CHECK-LE: # %bb.0: # %entry
; CHECK-LE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-LE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-LE-NEXT: blr
;
; CHECK-NOPCREL-BE-LABEL: testFloatDenormScalar:
; CHECK-NOPCREL-BE: # %bb.0: # %entry
; CHECK-NOPCREL-BE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-NOPCREL-BE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NOPCREL-BE-NEXT: blr
;
; CHECK-NOPCREL-LE-LABEL: testFloatDenormScalar:
; CHECK-NOPCREL-LE: # %bb.0: # %entry
; CHECK-NOPCREL-LE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-NOPCREL-LE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NOPCREL-LE-NEXT: blr
;
; CHECK-NOPREFIX-LABEL: testFloatDenormScalar:
@@ -189,7 +182,6 @@ define dso_local float @testFloatDenormScalar() local_unnamed_addr {
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-BE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-BE-NEXT: blr
entry:
ret float 0x380B38FB80000000
@@ -200,21 +192,18 @@ define dso_local double @testFloatDenormToDoubleScalar() local_unnamed_addr {
; CHECK-LE: # %bb.0: # %entry
; CHECK-LE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-LE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-LE-NEXT: blr
;
; CHECK-NOPCREL-BE-LABEL: testFloatDenormToDoubleScalar:
; CHECK-NOPCREL-BE: # %bb.0: # %entry
; CHECK-NOPCREL-BE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-NOPCREL-BE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NOPCREL-BE-NEXT: blr
;
; CHECK-NOPCREL-LE-LABEL: testFloatDenormToDoubleScalar:
; CHECK-NOPCREL-LE: # %bb.0: # %entry
; CHECK-NOPCREL-LE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-NOPCREL-LE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-NOPCREL-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NOPCREL-LE-NEXT: blr
;
; CHECK-NOPREFIX-LABEL: testFloatDenormToDoubleScalar:
@@ -227,7 +216,6 @@ define dso_local double @testFloatDenormToDoubleScalar() local_unnamed_addr {
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xxsplti32dx vs1, 0, 940259579
; CHECK-BE-NEXT: xxsplti32dx vs1, 1, -2147483648
-; CHECK-BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-BE-NEXT: blr
entry:
ret double 0x380B38FB80000000
diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
index 87b8a64..8f12b18 100644
--- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@@ -2416,7 +2416,6 @@ define double @getd0(<2 x double> %vd) {
; CHECK-LE-LABEL: getd0:
; CHECK-LE: # %bb.0: # %entry
; CHECK-LE-NEXT: xxswapd vs1, v2
-; CHECK-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-LE-NEXT: blr
;
; CHECK-AIX-LABEL: getd0:
@@ -2435,7 +2434,6 @@ define double @getd1(<2 x double> %vd) {
; CHECK-LABEL: getd1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxswapd vs1, v2
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; CHECK-LE-LABEL: getd1:
@@ -2446,7 +2444,6 @@ define double @getd1(<2 x double> %vd) {
; CHECK-AIX-LABEL: getd1:
; CHECK-AIX: # %bb.0: # %entry
; CHECK-AIX-NEXT: xxswapd 1, 34
-; CHECK-AIX-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-AIX-NEXT: blr
entry:
%vecext = extractelement <2 x double> %vd, i32 1
@@ -2462,7 +2459,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
; CHECK-NEXT: lvsl v3, 0, r3
; CHECK-NEXT: vperm v2, v2, v2, v3
; CHECK-NEXT: xxlor vs1, v2, v2
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; CHECK-LE-LABEL: getveld:
@@ -2474,7 +2470,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
; CHECK-LE-NEXT: lvsl v3, 0, r3
; CHECK-LE-NEXT: vperm v2, v2, v2, v3
; CHECK-LE-NEXT: xxlor vs1, v2, v2
-; CHECK-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-LE-NEXT: blr
;
; CHECK-AIX-LABEL: getveld:
@@ -2484,7 +2479,6 @@ define double @getveld(<2 x double> %vd, i32 signext %i) {
; CHECK-AIX-NEXT: lvsl 3, 0, 3
; CHECK-AIX-NEXT: vperm 2, 2, 2, 3
; CHECK-AIX-NEXT: xxlor 1, 34, 34
-; CHECK-AIX-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-AIX-NEXT: blr
entry:
%vecext = extractelement <2 x double> %vd, i32 %i
diff --git a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
index 541b2c4..0b1047b 100644
--- a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
+++ b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
@@ -175,7 +175,6 @@ define dso_local double @UsesX2AsConstPoolTOC() local_unnamed_addr {
; CHECK-ALL: # %bb.0: # %entry
; CHECK-S-NEXT: xxsplti32dx vs1, 0, 1078011044
; CHECK-S-NEXT: xxsplti32dx vs1, 1, -337824948
-; CHECK-S-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-S-NEXT: blr
entry:
ret double 0x404124A4EBDD334C
diff --git a/llvm/test/CodeGen/PowerPC/save-reg-params.ll b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
index b85ba6c..da4cd51 100644
--- a/llvm/test/CodeGen/PowerPC/save-reg-params.ll
+++ b/llvm/test/CodeGen/PowerPC/save-reg-params.ll
@@ -609,7 +609,6 @@ define void @mixed_2(<2 x double> %a, <4 x i32> %b, i64 %c) #0 {
; 32BIT-NEXT: nop
; 32BIT-NEXT: li 3, 64
; 32BIT-NEXT: lxvd2x 1, 1, 3 # 16-byte Folded Reload
-; 32BIT-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; 32BIT-NEXT: bl .consume_f64[PR]
; 32BIT-NEXT: nop
; 32BIT-NEXT: addi 1, 1, 80
@@ -635,7 +634,6 @@ define void @mixed_2(<2 x double> %a, <4 x i32> %b, i64 %c) #0 {
; 64BIT-NEXT: nop
; 64BIT-NEXT: li 3, 128
; 64BIT-NEXT: lxvd2x 1, 1, 3 # 16-byte Folded Reload
-; 64BIT-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; 64BIT-NEXT: bl .consume_f64[PR]
; 64BIT-NEXT: nop
; 64BIT-NEXT: addi 1, 1, 144
diff --git a/llvm/test/CodeGen/PowerPC/select_const.ll b/llvm/test/CodeGen/PowerPC/select_const.ll
index ca4be83..a48d696 100644
--- a/llvm/test/CodeGen/PowerPC/select_const.ll
+++ b/llvm/test/CodeGen/PowerPC/select_const.ll
@@ -845,12 +845,10 @@ define double @sel_constants_frem_constant(i1 %cond) {
; ALL-NEXT: # %bb.1:
; ALL-NEXT: addis 3, 2, .LCPI48_0@toc@ha
; ALL-NEXT: lfd 1, .LCPI48_0@toc@l(3)
-; ALL-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; ALL-NEXT: blr
; ALL-NEXT: .LBB48_2:
; ALL-NEXT: vspltisw 2, -4
; ALL-NEXT: xvcvsxwdp 1, 34
-; ALL-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; ALL-NEXT: blr
%sel = select i1 %cond, double -4.0, double 23.3
%bo = frem double %sel, 5.1
diff --git a/llvm/test/CodeGen/PowerPC/subreg-coalescer.mir b/llvm/test/CodeGen/PowerPC/subreg-coalescer.mir
index 39eab1f..31407e0 100644
--- a/llvm/test/CodeGen/PowerPC/subreg-coalescer.mir
+++ b/llvm/test/CodeGen/PowerPC/subreg-coalescer.mir
@@ -31,4 +31,3 @@ body: |
$v2 = COPY %5
BLR8 implicit $lr8, implicit $rm, implicit $v2
...
-
diff --git a/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
index e1fd618..cf69d3a 100644
--- a/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
+++ b/llvm/test/CodeGen/PowerPC/subreg-lanemasks.mir
@@ -5,21 +5,18 @@
# Keep track of all of the lanemasks for various subregisters.
#
-# TODO: The mask for %6.sub_vsx1:accrc is the same as the mask for %10.sub_vsx1_then_sub_64:accrc.
-# Ideally on PowerPC these masks should be different. To be addressed in a later patch.
-#
-# CHECK: %3 [80r,80d:0) 0@80r L0000000000000004 [80r,80d:0) 0@80r weight:0.000000e+00
-# CHECK: %4 [96r,96d:0) 0@96r L0000000000000800 [96r,96d:0) 0@96r weight:0.000000e+00
-# CHECK: %5 [112r,112d:0) 0@112r L0000000000000004 [112r,112d:0) 0@112r weight:0.000000e+00
-# CHECK: %6 [128r,128d:0) 0@128r L0000000000000800 [128r,128d:0) 0@128r weight:0.000000e+00
+# CHECK: %3 [80r,80d:0) 0@80r L000000000000000C [80r,80d:0) 0@80r weight:0.000000e+00
+# CHECK: %4 [96r,96d:0) 0@96r L0000000000003000 [96r,96d:0) 0@96r weight:0.000000e+00
+# CHECK: %5 [112r,112d:0) 0@112r L000000000000000C [112r,112d:0) 0@112r weight:0.000000e+00
+# CHECK: %6 [128r,128d:0) 0@128r L0000000000003000 [128r,128d:0) 0@128r weight:0.000000e+00
# CHECK: %7 [144r,144d:0) 0@144r L0000000000000004 [144r,144d:0) 0@144r weight:0.000000e+00
-# CHECK: %8 [160r,160d:0) 0@160r L0000000000000800 [160r,160d:0) 0@160r weight:0.000000e+00
+# CHECK: %8 [160r,160d:0) 0@160r L0000000000001000 [160r,160d:0) 0@160r weight:0.000000e+00
# CHECK: %9 [176r,176d:0) 0@176r L0000000000000004 [176r,176d:0) 0@176r weight:0.000000e+00
-# CHECK: %10 [192r,192d:0) 0@192r L0000000000000800 [192r,192d:0) 0@192r weight:0.000000e+00
-# CHECK: %11 [208r,208d:0) 0@208r L0000000000001000 [208r,208d:0) 0@208r weight:0.000000e+00
-# CHECK: %12 [224r,224d:0) 0@224r L0000000000002000 [224r,224d:0) 0@224r weight:0.000000e+00
-# CHECK: %13 [240r,240d:0) 0@240r L0000000000000804 [240r,240d:0) 0@240r weight:0.000000e+00
-# CHECK: %14 [256r,256d:0) 0@256r L0000000000003000 [256r,256d:0) 0@256r weight:0.000000e+00
+# CHECK: %10 [192r,192d:0) 0@192r L0000000000001000 [192r,192d:0) 0@192r weight:0.000000e+00
+# CHECK: %11 [208r,208d:0) 0@208r L0000000000004000 [208r,208d:0) 0@208r weight:0.000000e+00
+# CHECK: %12 [224r,224d:0) 0@224r L0000000000010000 [224r,224d:0) 0@224r weight:0.000000e+00
+# CHECK: %13 [240r,240d:0) 0@240r L000000000000300C [240r,240d:0) 0@240r weight:0.000000e+00
+# CHECK: %14 [256r,256d:0) 0@256r L000000000003C000 [256r,256d:0) 0@256r weight:0.000000e+00
# CHECK: 0B bb.0
diff --git a/llvm/test/CodeGen/PowerPC/toc-float.ll b/llvm/test/CodeGen/PowerPC/toc-float.ll
index 1d6f1f7..943edd5 100644
--- a/llvm/test/CodeGen/PowerPC/toc-float.ll
+++ b/llvm/test/CodeGen/PowerPC/toc-float.ll
@@ -9,14 +9,12 @@ define double @doubleConstant1() {
; CHECK-P9: # %bb.0:
; CHECK-P9-NEXT: vspltisw 2, 14
; CHECK-P9-NEXT: xvcvsxwdp 1, 34
-; CHECK-P9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-P9-NEXT: blr
;
; CHECK-P8-LABEL: doubleConstant1:
; CHECK-P8: # %bb.0:
; CHECK-P8-NEXT: vspltisw 2, 14
; CHECK-P8-NEXT: xvcvsxwdp 1, 34
-; CHECK-P8-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-P8-NEXT: blr
ret double 1.400000e+01
}
diff --git a/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
index 49c80a9..d0dda1a 100644
--- a/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
+++ b/llvm/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -122,7 +122,6 @@ define double @getd(<2 x double> %a, i32 zeroext %b) {
; CHECK-NEXT: lvsl 3, 0, 3
; CHECK-NEXT: vperm 2, 2, 2, 3
; CHECK-NEXT: xxlor 1, 34, 34
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: getd:
@@ -132,7 +131,6 @@ define double @getd(<2 x double> %a, i32 zeroext %b) {
; CHECK-BE-NEXT: lvsl 3, 0, 3
; CHECK-BE-NEXT: vperm 2, 2, 2, 3
; CHECK-BE-NEXT: xxlor 1, 34, 34
-; CHECK-BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-BE-NEXT: blr
;
; CHECK-P7-LABEL: getd:
@@ -142,7 +140,6 @@ define double @getd(<2 x double> %a, i32 zeroext %b) {
; CHECK-P7-NEXT: lvsl 3, 0, 3
; CHECK-P7-NEXT: vperm 2, 2, 2, 3
; CHECK-P7-NEXT: xxlor 1, 34, 34
-; CHECK-P7-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-P7-NEXT: blr
entry:
%vecext = extractelement <2 x double> %a, i32 %b
diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
index b98aed8..291a9c1 100644
--- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll
@@ -940,25 +940,21 @@ entry:
define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) {
; CHECK-LABEL: testDoubleImm1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-NEXT: xxmrghd v2, v2, vs1
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: testDoubleImm1:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-BE-NEXT: xxpermdi v2, vs1, v2, 1
; CHECK-BE-NEXT: blr
;
; CHECK-P9-LABEL: testDoubleImm1:
; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; CHECK-P9-NEXT: xxpermdi v2, vs1, v2, 1
; CHECK-P9-NEXT: blr
;
; AIX-P8-LABEL: testDoubleImm1:
; AIX-P8: # %bb.0: # %entry
-; AIX-P8-NEXT: # kill: def $f1 killed $f1 def $vsl1
; AIX-P8-NEXT: xxpermdi v2, vs1, v2, 1
; AIX-P8-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
index f217162..aedb1a9 100644
--- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
@@ -107,32 +107,20 @@ entry:
define <3 x double> @constrained_vector_fdiv_v3f64(<3 x double> %x, <3 x double> %y) #0 {
; PC64LE-LABEL: constrained_vector_fdiv_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 5, 4
; PC64LE-NEXT: xxmrghd 1, 2, 1
; PC64LE-NEXT: xsdivdp 3, 3, 6
; PC64LE-NEXT: xvdivdp 2, 1, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_fdiv_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 5, 4
; PC64LE9-NEXT: xxmrghd 1, 2, 1
; PC64LE9-NEXT: xsdivdp 3, 3, 6
; PC64LE9-NEXT: xvdivdp 2, 1, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%div = call <3 x double> @llvm.experimental.constrained.fdiv.v3f64(
@@ -217,13 +205,10 @@ define <2 x double> @constrained_vector_frem_v2f64(<2 x double> %x, <2 x double>
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxswapd 2, 63
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: bl fmod
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -252,11 +237,8 @@ define <2 x double> @constrained_vector_frem_v2f64(<2 x double> %x, <2 x double>
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
; PC64LE9-NEXT: xxswapd 2, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 61, 1
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 48(1) # 16-byte Folded Reload
@@ -408,7 +390,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: fmr 2, 30
; PC64LE-NEXT: bl fmod
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 29
; PC64LE-NEXT: fmr 2, 31
@@ -423,7 +404,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: lfd 29, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: lfd 28, 64(1) # 8-byte Folded Reload
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: addi 1, 1, 96
; PC64LE-NEXT: ld 0, 16(1)
; PC64LE-NEXT: mtlr 0
@@ -451,7 +431,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
; PC64LE9-NEXT: fmr 2, 30
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 29
; PC64LE9-NEXT: fmr 2, 31
@@ -462,7 +441,6 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double>
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
; PC64LE9-NEXT: lfd 29, 56(1) # 8-byte Folded Reload
; PC64LE9-NEXT: lfd 28, 48(1) # 8-byte Folded Reload
@@ -505,12 +483,9 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 59, 1, 1
; PC64LE-NEXT: xxswapd 1, 60
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxswapd 2, 62
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: bl fmod
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 59, 1
; PC64LE-NEXT: xxlor 1, 61, 61
; PC64LE-NEXT: xxlor 2, 63, 63
@@ -518,14 +493,11 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 60, 1, 1
; PC64LE-NEXT: xxswapd 1, 61
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxswapd 2, 63
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: bl fmod
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 112
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 60, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 96
@@ -562,11 +534,8 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
; PC64LE9-NEXT: xscpsgndp 59, 1, 1
; PC64LE9-NEXT: xxswapd 1, 60
; PC64LE9-NEXT: xxswapd 2, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 59, 1
; PC64LE9-NEXT: xscpsgndp 1, 61, 61
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
@@ -575,11 +544,8 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double>
; PC64LE9-NEXT: xscpsgndp 60, 1, 1
; PC64LE9-NEXT: xxswapd 1, 61
; PC64LE9-NEXT: xxswapd 2, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 60, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 96(1) # 16-byte Folded Reload
@@ -704,32 +670,20 @@ entry:
define <3 x double> @constrained_vector_fmul_v3f64(<3 x double> %x, <3 x double> %y) #0 {
; PC64LE-LABEL: constrained_vector_fmul_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 5, 4
; PC64LE-NEXT: xxmrghd 1, 2, 1
; PC64LE-NEXT: xsmuldp 3, 3, 6
; PC64LE-NEXT: xvmuldp 2, 1, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_fmul_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 5, 4
; PC64LE9-NEXT: xxmrghd 1, 2, 1
; PC64LE9-NEXT: xsmuldp 3, 3, 6
; PC64LE9-NEXT: xvmuldp 2, 1, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%mul = call <3 x double> @llvm.experimental.constrained.fmul.v3f64(
@@ -866,32 +820,20 @@ entry:
define <3 x double> @constrained_vector_fadd_v3f64(<3 x double> %x, <3 x double> %y) #0 {
; PC64LE-LABEL: constrained_vector_fadd_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 5, 4
; PC64LE-NEXT: xxmrghd 1, 2, 1
; PC64LE-NEXT: xsadddp 3, 3, 6
; PC64LE-NEXT: xvadddp 2, 1, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_fadd_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 5, 4
; PC64LE9-NEXT: xxmrghd 1, 2, 1
; PC64LE9-NEXT: xsadddp 3, 3, 6
; PC64LE9-NEXT: xvadddp 2, 1, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%add = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(
@@ -1028,32 +970,20 @@ entry:
define <3 x double> @constrained_vector_fsub_v3f64(<3 x double> %x, <3 x double> %y) #0 {
; PC64LE-LABEL: constrained_vector_fsub_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 5, 4
; PC64LE-NEXT: xxmrghd 1, 2, 1
; PC64LE-NEXT: xssubdp 3, 3, 6
; PC64LE-NEXT: xvsubdp 2, 1, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_fsub_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 5, 4
; PC64LE9-NEXT: xxmrghd 1, 2, 1
; PC64LE9-NEXT: xssubdp 3, 3, 6
; PC64LE9-NEXT: xvsubdp 2, 1, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%sub = call <3 x double> @llvm.experimental.constrained.fsub.v3f64(
@@ -1175,26 +1105,18 @@ entry:
define <3 x double> @constrained_vector_sqrt_v3f64(<3 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_sqrt_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 2, 1
; PC64LE-NEXT: xssqrtdp 3, 3
; PC64LE-NEXT: xvsqrtdp 2, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_sqrt_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 2, 1
; PC64LE9-NEXT: xssqrtdp 3, 3
; PC64LE9-NEXT: xvsqrtdp 2, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%sqrt = call <3 x double> @llvm.experimental.constrained.sqrt.v3f64(
@@ -1277,13 +1199,10 @@ define <2 x double> @constrained_vector_pow_v2f64(<2 x double> %x, <2 x double>
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxswapd 2, 63
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: bl pow
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -1312,11 +1231,8 @@ define <2 x double> @constrained_vector_pow_v2f64(<2 x double> %x, <2 x double>
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
; PC64LE9-NEXT: xxswapd 2, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 61, 1
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 48(1) # 16-byte Folded Reload
@@ -1468,7 +1384,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: fmr 2, 30
; PC64LE-NEXT: bl pow
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 29
; PC64LE-NEXT: fmr 2, 31
@@ -1483,7 +1398,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: lfd 29, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: lfd 28, 64(1) # 8-byte Folded Reload
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: addi 1, 1, 96
; PC64LE-NEXT: ld 0, 16(1)
; PC64LE-NEXT: mtlr 0
@@ -1511,7 +1425,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
; PC64LE9-NEXT: fmr 2, 30
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 29
; PC64LE9-NEXT: fmr 2, 31
@@ -1522,7 +1435,6 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double>
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
; PC64LE9-NEXT: lfd 29, 56(1) # 8-byte Folded Reload
; PC64LE9-NEXT: lfd 28, 48(1) # 8-byte Folded Reload
@@ -1565,12 +1477,9 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 59, 1, 1
; PC64LE-NEXT: xxswapd 1, 60
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxswapd 2, 62
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: bl pow
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 59, 1
; PC64LE-NEXT: xxlor 1, 61, 61
; PC64LE-NEXT: xxlor 2, 63, 63
@@ -1578,14 +1487,11 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 60, 1, 1
; PC64LE-NEXT: xxswapd 1, 61
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxswapd 2, 63
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: bl pow
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 112
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 60, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 96
@@ -1622,11 +1528,8 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
; PC64LE9-NEXT: xscpsgndp 59, 1, 1
; PC64LE9-NEXT: xxswapd 1, 60
; PC64LE9-NEXT: xxswapd 2, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 59, 1
; PC64LE9-NEXT: xscpsgndp 1, 61, 61
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
@@ -1635,11 +1538,8 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double>
; PC64LE9-NEXT: xscpsgndp 60, 1, 1
; PC64LE9-NEXT: xxswapd 1, 61
; PC64LE9-NEXT: xxswapd 2, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 60, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 96(1) # 16-byte Folded Reload
@@ -1712,14 +1612,12 @@ define <2 x double> @constrained_vector_powi_v2f64(<2 x double> %x, i32 %y) #0 {
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl __powidf2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: bl __powidf2
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: ld 30, 80(1) # 8-byte Folded Reload
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
@@ -1744,13 +1642,11 @@ define <2 x double> @constrained_vector_powi_v2f64(<2 x double> %x, i32 %y) #0 {
; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -1894,7 +1790,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: bl __powidf2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: mr 4, 30
@@ -1907,7 +1802,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 80(1) # 8-byte Folded Reload
; PC64LE-NEXT: ld 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 96
; PC64LE-NEXT: ld 0, 16(1)
@@ -1934,7 +1828,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: mr 4, 30
@@ -1945,7 +1838,6 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: ld 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 80
@@ -1981,27 +1873,23 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 {
; PC64LE-NEXT: vmr 31, 3
; PC64LE-NEXT: bl __powidf2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: bl __powidf2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: bl __powidf2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT: mr 4, 30
; PC64LE-NEXT: bl __powidf2
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: ld 30, 96(1) # 8-byte Folded Reload
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
@@ -2030,25 +1918,21 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 {
; PC64LE9-NEXT: vmr 31, 3
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT: mr 4, 30
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -2116,11 +2000,9 @@ define <2 x double> @constrained_vector_sin_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl sin
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -2143,10 +2025,8 @@ define <2 x double> @constrained_vector_sin_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl sin
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -2269,7 +2149,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl sin
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl sin
@@ -2280,7 +2159,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -2303,7 +2181,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl sin
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl sin
@@ -2313,7 +2190,6 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -2346,22 +2222,18 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl sin
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl sin
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl sin
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -2388,20 +2260,16 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl sin
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl sin
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl sin
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -2467,11 +2335,9 @@ define <2 x double> @constrained_vector_cos_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl cos
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -2494,10 +2360,8 @@ define <2 x double> @constrained_vector_cos_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl cos
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -2620,7 +2484,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl cos
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl cos
@@ -2631,7 +2494,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -2654,7 +2516,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl cos
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl cos
@@ -2664,7 +2525,6 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -2697,22 +2557,18 @@ define <4 x double> @constrained_vector_cos_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl cos
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl cos
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl cos
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -2739,20 +2595,16 @@ define <4 x double> @constrained_vector_cos_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl cos
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl cos
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl cos
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -2818,11 +2670,9 @@ define <2 x double> @constrained_vector_exp_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl exp
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -2845,10 +2695,8 @@ define <2 x double> @constrained_vector_exp_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl exp
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -2971,7 +2819,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl exp
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl exp
@@ -2982,7 +2829,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -3005,7 +2851,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl exp
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl exp
@@ -3015,7 +2860,6 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -3048,22 +2892,18 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl exp
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl exp
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl exp
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -3090,20 +2930,16 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl exp
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl exp
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl exp
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -3169,11 +3005,9 @@ define <2 x double> @constrained_vector_exp2_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl exp2
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -3196,10 +3030,8 @@ define <2 x double> @constrained_vector_exp2_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl exp2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -3322,7 +3154,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl exp2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl exp2
@@ -3333,7 +3164,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -3356,7 +3186,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl exp2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl exp2
@@ -3366,7 +3195,6 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -3399,22 +3227,18 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl exp2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl exp2
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl exp2
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -3441,20 +3265,16 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl exp2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl exp2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl exp2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -3520,11 +3340,9 @@ define <2 x double> @constrained_vector_log_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -3547,10 +3365,8 @@ define <2 x double> @constrained_vector_log_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -3673,7 +3489,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl log
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl log
@@ -3684,7 +3499,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -3707,7 +3521,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl log
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl log
@@ -3717,7 +3530,6 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -3750,22 +3562,18 @@ define <4 x double> @constrained_vector_log_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl log
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -3792,20 +3600,16 @@ define <4 x double> @constrained_vector_log_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl log
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -3871,11 +3675,9 @@ define <2 x double> @constrained_vector_log10_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log10
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -3898,10 +3700,8 @@ define <2 x double> @constrained_vector_log10_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log10
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -4024,7 +3824,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl log10
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl log10
@@ -4035,7 +3834,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -4058,7 +3856,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl log10
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl log10
@@ -4068,7 +3865,6 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -4101,22 +3897,18 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log10
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl log10
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log10
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -4143,20 +3935,16 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log10
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl log10
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log10
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -4222,11 +4010,9 @@ define <2 x double> @constrained_vector_log2_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log2
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -4249,10 +4035,8 @@ define <2 x double> @constrained_vector_log2_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -4375,7 +4159,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl log2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl log2
@@ -4386,7 +4169,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -4409,7 +4191,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl log2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl log2
@@ -4419,7 +4200,6 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -4452,22 +4232,18 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log2
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl log2
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl log2
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -4494,20 +4270,16 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl log2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl log2
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -4615,26 +4387,18 @@ define <3 x float> @constrained_vector_rint_v3f32(<3 x float> %x) #0 {
define <3 x double> @constrained_vector_rint_v3f64(<3 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_rint_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 2, 1
; PC64LE-NEXT: xsrdpic 3, 3
; PC64LE-NEXT: xvrdpic 2, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_rint_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 2, 1
; PC64LE9-NEXT: xsrdpic 3, 3
; PC64LE9-NEXT: xvrdpic 2, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64(
@@ -4712,11 +4476,9 @@ define <2 x double> @constrained_vector_nearbyint_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl nearbyint
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -4739,10 +4501,8 @@ define <2 x double> @constrained_vector_nearbyint_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl nearbyint
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -4865,7 +4625,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl nearbyint
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl nearbyint
@@ -4876,7 +4635,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -4899,7 +4657,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl nearbyint
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl nearbyint
@@ -4909,7 +4666,6 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -4942,22 +4698,18 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl nearbyint
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl nearbyint
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl nearbyint
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -4984,20 +4736,16 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl nearbyint
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl nearbyint
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl nearbyint
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
@@ -5179,10 +4927,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: mflr 0
; PC64LE-NEXT: stdu 1, -64(1)
; PC64LE-NEXT: li 3, 48
-; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 5, 4
; PC64LE-NEXT: xxmrghd 1, 2, 1
; PC64LE-NEXT: std 0, 80(1)
@@ -5195,7 +4939,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: li 3, 48
; PC64LE-NEXT: fmr 3, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 64
@@ -5207,10 +4950,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: mflr 0
; PC64LE9-NEXT: stdu 1, -48(1)
-; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 5, 4
; PC64LE9-NEXT: xxmrghd 1, 2, 1
; PC64LE9-NEXT: std 0, 64(1)
@@ -5224,7 +4963,6 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double>
; PC64LE9-NEXT: xxswapd 1, 63
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: addi 1, 1, 48
; PC64LE9-NEXT: ld 0, 16(1)
; PC64LE9-NEXT: mtlr 0
@@ -5421,10 +5159,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: mflr 0
; PC64LE-NEXT: stdu 1, -64(1)
; PC64LE-NEXT: li 3, 48
-; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 5, 4
; PC64LE-NEXT: xxmrghd 1, 2, 1
; PC64LE-NEXT: std 0, 80(1)
@@ -5437,7 +5171,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
; PC64LE-NEXT: li 3, 48
; PC64LE-NEXT: fmr 3, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 64
@@ -5449,10 +5182,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: mflr 0
; PC64LE9-NEXT: stdu 1, -48(1)
-; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5
-; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 5, 4
; PC64LE9-NEXT: xxmrghd 1, 2, 1
; PC64LE9-NEXT: std 0, 64(1)
@@ -5466,7 +5195,6 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double>
; PC64LE9-NEXT: xxswapd 1, 63
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: addi 1, 1, 48
; PC64LE9-NEXT: ld 0, 16(1)
; PC64LE9-NEXT: mtlr 0
@@ -6792,26 +6520,18 @@ entry:
define <3 x double> @constrained_vector_ceil_v3f64(<3 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_ceil_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 2, 1
; PC64LE-NEXT: xsrdpip 3, 3
; PC64LE-NEXT: xvrdpip 2, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_ceil_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 2, 1
; PC64LE9-NEXT: xsrdpip 3, 3
; PC64LE9-NEXT: xvrdpip 2, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64(
@@ -6908,26 +6628,18 @@ entry:
define <3 x double> @constrained_vector_floor_v3f64(<3 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_floor_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 2, 1
; PC64LE-NEXT: xsrdpim 3, 3
; PC64LE-NEXT: xvrdpim 2, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_floor_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 2, 1
; PC64LE9-NEXT: xsrdpim 3, 3
; PC64LE9-NEXT: xvrdpim 2, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64(
@@ -7024,26 +6736,18 @@ entry:
define <3 x double> @constrained_vector_round_v3f64(<3 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_round_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 2, 1
; PC64LE-NEXT: xsrdpi 3, 3
; PC64LE-NEXT: xvrdpi 2, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_round_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 2, 1
; PC64LE9-NEXT: xsrdpi 3, 3
; PC64LE9-NEXT: xvrdpi 2, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%round = call <3 x double> @llvm.experimental.constrained.round.v3f64(
@@ -7139,26 +6843,18 @@ entry:
define <3 x double> @constrained_vector_trunc_v3f64(<3 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_trunc_v3f64:
; PC64LE: # %bb.0: # %entry
-; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 0, 2, 1
; PC64LE-NEXT: xsrdpiz 3, 3
; PC64LE-NEXT: xvrdpiz 2, 0
; PC64LE-NEXT: xxswapd 1, 2
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_trunc_v3f64:
; PC64LE9: # %bb.0: # %entry
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 0, 2, 1
; PC64LE9-NEXT: xsrdpiz 3, 3
; PC64LE9-NEXT: xvrdpiz 2, 0
; PC64LE9-NEXT: xxswapd 1, 2
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PC64LE9-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PC64LE9-NEXT: blr
entry:
%trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64(
@@ -8350,11 +8046,9 @@ define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 62, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl tan
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 64
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 34, 62, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 48
@@ -8377,10 +8071,8 @@ define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 62, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl tan
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 34, 62, 1
; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload
@@ -8503,7 +8195,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: fmr 1, 30
; PC64LE-NEXT: bl tan
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 63, 1, 63
; PC64LE-NEXT: fmr 1, 31
; PC64LE-NEXT: bl tan
@@ -8514,7 +8205,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload
; PC64LE-NEXT: xxlor 2, 63, 63
; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: addi 1, 1, 80
; PC64LE-NEXT: ld 0, 16(1)
@@ -8537,7 +8227,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: fmr 1, 30
; PC64LE9-NEXT: bl tan
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 63, 1, 63
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: bl tan
@@ -8547,7 +8236,6 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
; PC64LE9-NEXT: xscpsgndp 2, 63, 63
; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload
; PC64LE9-NEXT: addi 1, 1, 64
; PC64LE9-NEXT: ld 0, 16(1)
@@ -8580,22 +8268,18 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 {
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 62
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl tan
; PC64LE-NEXT: nop
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 62, 61, 1
; PC64LE-NEXT: xxlor 1, 63, 63
; PC64LE-NEXT: bl tan
; PC64LE-NEXT: nop
; PC64LE-NEXT: xxlor 61, 1, 1
; PC64LE-NEXT: xxswapd 1, 63
-; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE-NEXT: bl tan
; PC64LE-NEXT: nop
; PC64LE-NEXT: li 3, 80
; PC64LE-NEXT: vmr 2, 30
-; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE-NEXT: xxmrghd 35, 61, 1
; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload
; PC64LE-NEXT: li 3, 64
@@ -8622,20 +8306,16 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 {
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 62
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl tan
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 62, 61, 1
; PC64LE9-NEXT: xscpsgndp 1, 63, 63
; PC64LE9-NEXT: bl tan
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscpsgndp 61, 1, 1
; PC64LE9-NEXT: xxswapd 1, 63
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PC64LE9-NEXT: bl tan
; PC64LE9-NEXT: nop
-; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: xxmrghd 35, 61, 1
; PC64LE9-NEXT: vmr 2, 30
; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/vector-llrint.ll b/llvm/test/CodeGen/PowerPC/vector-llrint.ll
index 4321b21..190cf6f 100644
--- a/llvm/test/CodeGen/PowerPC/vector-llrint.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-llrint.ll
@@ -4465,9 +4465,8 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
; BE-NEXT: xxlor f1, v31, v31
; BE-NEXT: bl llrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 128(r1)
; BE-NEXT: xxswapd vs1, v31
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 128(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
; BE-NEXT: std r3, 136(r1)
@@ -4496,7 +4495,6 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v31
; CHECK-NEXT: mtvsrd v31, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl llrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4544,18 +4542,16 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
; BE-NEXT: vmr v31, v3
; BE-NEXT: bl llrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 128(r1)
; BE-NEXT: xxswapd vs1, v30
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 128(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v31, v31
; BE-NEXT: std r3, 136(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 144(r1)
; BE-NEXT: xxswapd vs1, v31
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 144(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
; BE-NEXT: std r3, 152(r1)
@@ -4592,7 +4588,6 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v30
; CHECK-NEXT: mtvsrd v30, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl llrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4602,7 +4597,6 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v31
; CHECK-NEXT: mtvsrd v31, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl llrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4670,36 +4664,32 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
; BE-NEXT: vmr v31, v5
; BE-NEXT: bl llrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 128(r1)
; BE-NEXT: xxswapd vs1, v28
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 128(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v29, v29
; BE-NEXT: std r3, 136(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 144(r1)
; BE-NEXT: xxswapd vs1, v29
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 144(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v30, v30
; BE-NEXT: std r3, 152(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 160(r1)
; BE-NEXT: xxswapd vs1, v30
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 160(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v31, v31
; BE-NEXT: std r3, 168(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 176(r1)
; BE-NEXT: xxswapd vs1, v31
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 176(r1)
; BE-NEXT: bl llrint
; BE-NEXT: nop
; BE-NEXT: std r3, 184(r1)
@@ -4752,7 +4742,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v28
; CHECK-NEXT: mtvsrd v28, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl llrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4762,7 +4751,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v29
; CHECK-NEXT: mtvsrd v29, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl llrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4772,7 +4760,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v30
; CHECK-NEXT: mtvsrd v30, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl llrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4782,7 +4769,6 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v31
; CHECK-NEXT: mtvsrd v31, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl llrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
diff --git a/llvm/test/CodeGen/PowerPC/vector-lrint.ll b/llvm/test/CodeGen/PowerPC/vector-lrint.ll
index 9667a261..b6d0bd5 100644
--- a/llvm/test/CodeGen/PowerPC/vector-lrint.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-lrint.ll
@@ -4476,9 +4476,8 @@ define <2 x i64> @lrint_v2f64(<2 x double> %x) {
; BE-NEXT: xxlor f1, v31, v31
; BE-NEXT: bl lrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 128(r1)
; BE-NEXT: xxswapd vs1, v31
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 128(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
; BE-NEXT: std r3, 136(r1)
@@ -4507,7 +4506,6 @@ define <2 x i64> @lrint_v2f64(<2 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v31
; CHECK-NEXT: mtvsrd v31, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl lrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4555,18 +4553,16 @@ define <4 x i64> @lrint_v4f64(<4 x double> %x) {
; BE-NEXT: vmr v31, v3
; BE-NEXT: bl lrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 128(r1)
; BE-NEXT: xxswapd vs1, v30
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 128(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v31, v31
; BE-NEXT: std r3, 136(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 144(r1)
; BE-NEXT: xxswapd vs1, v31
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 144(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
; BE-NEXT: std r3, 152(r1)
@@ -4603,7 +4599,6 @@ define <4 x i64> @lrint_v4f64(<4 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v30
; CHECK-NEXT: mtvsrd v30, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl lrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4613,7 +4608,6 @@ define <4 x i64> @lrint_v4f64(<4 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v31
; CHECK-NEXT: mtvsrd v31, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl lrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4681,36 +4675,32 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
; BE-NEXT: vmr v31, v5
; BE-NEXT: bl lrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 128(r1)
; BE-NEXT: xxswapd vs1, v28
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 128(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v29, v29
; BE-NEXT: std r3, 136(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 144(r1)
; BE-NEXT: xxswapd vs1, v29
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 144(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v30, v30
; BE-NEXT: std r3, 152(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 160(r1)
; BE-NEXT: xxswapd vs1, v30
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 160(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
; BE-NEXT: xxlor f1, v31, v31
; BE-NEXT: std r3, 168(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
-; BE-NEXT: std r3, 176(r1)
; BE-NEXT: xxswapd vs1, v31
-; BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
+; BE-NEXT: std r3, 176(r1)
; BE-NEXT: bl lrint
; BE-NEXT: nop
; BE-NEXT: std r3, 184(r1)
@@ -4763,7 +4753,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v28
; CHECK-NEXT: mtvsrd v28, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl lrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4773,7 +4762,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v29
; CHECK-NEXT: mtvsrd v29, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl lrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4783,7 +4771,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v30
; CHECK-NEXT: mtvsrd v30, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl lrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
@@ -4793,7 +4780,6 @@ define <8 x i64> @lrint_v8f64(<8 x double> %x) {
; CHECK-NEXT: nop
; CHECK-NEXT: xxswapd vs1, v31
; CHECK-NEXT: mtvsrd v31, r3
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: bl lrint
; CHECK-NEXT: nop
; CHECK-NEXT: mtfprd f0, r3
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
index edd3fb7..4a036a7 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll
@@ -1081,14 +1081,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs0, v2
; PWR9LE-NEXT: xvadddp vs0, v2, vs0
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v2f64_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: xxswapd vs0, v2
; PWR9BE-NEXT: xvadddp vs1, v2, vs0
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v2f64_fast:
@@ -1096,14 +1094,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs0, v2
; PWR10LE-NEXT: xvadddp vs0, v2, vs0
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v2f64_fast:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: xxswapd vs0, v2
; PWR10BE-NEXT: xvadddp vs1, v2, vs0
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %a)
@@ -1203,7 +1199,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvadddp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v4f64_fast:
@@ -1211,7 +1206,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvadddp vs0, v2, v3
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v4f64_fast:
@@ -1220,7 +1214,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvadddp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v4f64_fast:
@@ -1228,7 +1221,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvadddp vs0, v2, v3
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> %a)
@@ -1378,7 +1370,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvadddp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v8f64_fast:
@@ -1388,7 +1379,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvadddp vs0, vs1, vs0
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v8f64_fast:
@@ -1399,7 +1389,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvadddp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v8f64_fast:
@@ -1409,7 +1398,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvadddp vs0, vs1, vs0
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %a)
@@ -1659,7 +1647,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvadddp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v16f64_fast:
@@ -1673,7 +1660,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvadddp vs0, vs0, vs2
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v16f64_fast:
@@ -1688,7 +1674,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvadddp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v16f64_fast:
@@ -1702,7 +1687,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvadddp vs0, vs0, vs2
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fadd.v16f64(double -0.000000e+00, <16 x double> %a)
@@ -2188,7 +2172,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvadddp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v32f64_fast:
@@ -2214,7 +2197,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvadddp vs0, vs0, vs2
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v32f64_fast:
@@ -2241,7 +2223,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvadddp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v32f64_fast:
@@ -2267,7 +2248,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvadddp vs0, vs0, vs2
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> %a)
@@ -3297,7 +3277,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvadddp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v64f64_fast:
@@ -3355,7 +3334,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvadddp vs0, vs1, vs0
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v64f64_fast:
@@ -3414,7 +3392,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvadddp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v64f64_fast:
@@ -3472,7 +3449,6 @@ define dso_local double @v64f64_fast(<64 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvadddp vs0, vs1, vs0
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvadddp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fadd.v64f64(double -0.000000e+00, <64 x double> %a)
@@ -3660,8 +3636,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9LE-NEXT: stfd f1, 32(r1)
; PWR9LE-NEXT: lxv vs1, 32(r1)
; PWR9LE-NEXT: xxswapd vs2, vs1
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR9LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR9LE-NEXT: addi r1, r1, 64
; PWR9LE-NEXT: ld r0, 16(r1)
; PWR9LE-NEXT: mtlr r0
@@ -3678,8 +3652,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR9BE-NEXT: stfd f1, 112(r1)
; PWR9BE-NEXT: lxv vs1, 112(r1)
; PWR9BE-NEXT: xxswapd vs2, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR9BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR9BE-NEXT: addi r1, r1, 144
; PWR9BE-NEXT: ld r0, 16(r1)
; PWR9BE-NEXT: mtlr r0
@@ -3695,8 +3667,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10LE-NEXT: stfd f1, 32(r1)
; PWR10LE-NEXT: lxv vs1, 32(r1)
; PWR10LE-NEXT: xxswapd vs2, vs1
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR10LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR10LE-NEXT: addi r1, r1, 64
; PWR10LE-NEXT: ld r0, 16(r1)
; PWR10LE-NEXT: mtlr r0
@@ -3713,8 +3683,6 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add
; PWR10BE-NEXT: stfd f1, 112(r1)
; PWR10BE-NEXT: lxv vs1, 112(r1)
; PWR10BE-NEXT: xxswapd vs2, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR10BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR10BE-NEXT: addi r1, r1, 144
; PWR10BE-NEXT: ld r0, 16(r1)
; PWR10BE-NEXT: mtlr r0
@@ -4077,8 +4045,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
; PWR9LE-NEXT: stfd f1, 32(r1)
; PWR9LE-NEXT: lxv vs1, 32(r1)
; PWR9LE-NEXT: xxswapd vs2, vs1
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR9LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR9LE-NEXT: addi r1, r1, 96
; PWR9LE-NEXT: ld r0, 16(r1)
; PWR9LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
@@ -4133,8 +4099,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
; PWR9BE-NEXT: lfd f28, 144(r1) # 8-byte Folded Reload
; PWR9BE-NEXT: lfd f27, 136(r1) # 8-byte Folded Reload
; PWR9BE-NEXT: lfd f26, 128(r1) # 8-byte Folded Reload
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR9BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR9BE-NEXT: addi r1, r1, 176
; PWR9BE-NEXT: ld r0, 16(r1)
; PWR9BE-NEXT: mtlr r0
@@ -4174,8 +4138,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
; PWR10LE-NEXT: stfd f1, 32(r1)
; PWR10LE-NEXT: lxv vs1, 32(r1)
; PWR10LE-NEXT: xxswapd vs2, vs1
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR10LE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR10LE-NEXT: addi r1, r1, 96
; PWR10LE-NEXT: ld r0, 16(r1)
; PWR10LE-NEXT: lfd f31, -8(r1) # 8-byte Folded Reload
@@ -4230,8 +4192,6 @@ define dso_local ppc_fp128 @v4ppcf128_fast(<4 x ppc_fp128> %a) local_unnamed_add
; PWR10BE-NEXT: lfd f26, 128(r1) # 8-byte Folded Reload
; PWR10BE-NEXT: lxv vs1, 112(r1)
; PWR10BE-NEXT: xxswapd vs2, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
-; PWR10BE-NEXT: # kill: def $f2 killed $f2 killed $vsl2
; PWR10BE-NEXT: addi r1, r1, 176
; PWR10BE-NEXT: ld r0, 16(r1)
; PWR10BE-NEXT: mtlr r0
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
index b1f72f6..7d02414 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmax.ll
@@ -635,14 +635,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs0, v2
; PWR9LE-NEXT: xvmaxdp vs0, v2, vs0
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v2f64_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: xxswapd vs0, v2
; PWR9BE-NEXT: xvmaxdp vs1, v2, vs0
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v2f64_fast:
@@ -650,14 +648,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs0, v2
; PWR10LE-NEXT: xvmaxdp vs0, v2, vs0
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v2f64_fast:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: xxswapd vs0, v2
; PWR10BE-NEXT: xvmaxdp vs1, v2, vs0
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
@@ -704,7 +700,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v4f64_fast:
@@ -712,7 +707,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmaxdp vs0, v2, v3
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v4f64_fast:
@@ -721,7 +715,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v4f64_fast:
@@ -729,7 +722,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmaxdp vs0, v2, v3
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a)
@@ -786,7 +778,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v8f64_fast:
@@ -796,7 +787,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmaxdp vs0, vs1, vs0
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v8f64_fast:
@@ -807,7 +797,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v8f64_fast:
@@ -817,7 +806,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmaxdp vs0, vs1, vs0
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmax.v8f64(<8 x double> %a)
@@ -894,7 +882,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v16f64_fast:
@@ -908,7 +895,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmaxdp vs0, vs0, vs2
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v16f64_fast:
@@ -923,7 +909,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v16f64_fast:
@@ -937,7 +922,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmaxdp vs0, vs0, vs2
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> %a)
@@ -1074,7 +1058,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v32f64_fast:
@@ -1100,7 +1083,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmaxdp vs0, vs0, vs2
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v32f64_fast:
@@ -1127,7 +1109,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmaxdp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v32f64_fast:
@@ -1153,7 +1134,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmaxdp vs0, vs0, vs2
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmaxdp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmax.v32f64(<32 x double> %a)
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
index e806a70..9b01889 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmin.ll
@@ -635,14 +635,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs0, v2
; PWR9LE-NEXT: xvmindp vs0, v2, vs0
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v2f64_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: xxswapd vs0, v2
; PWR9BE-NEXT: xvmindp vs1, v2, vs0
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v2f64_fast:
@@ -650,14 +648,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs0, v2
; PWR10LE-NEXT: xvmindp vs0, v2, vs0
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v2f64_fast:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: xxswapd vs0, v2
; PWR10BE-NEXT: xvmindp vs1, v2, vs0
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
@@ -704,7 +700,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmindp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v4f64_fast:
@@ -712,7 +707,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmindp vs0, v2, v3
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v4f64_fast:
@@ -721,7 +715,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmindp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v4f64_fast:
@@ -729,7 +722,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmindp vs0, v2, v3
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a)
@@ -786,7 +778,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmindp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v8f64_fast:
@@ -796,7 +787,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmindp vs0, vs1, vs0
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v8f64_fast:
@@ -807,7 +797,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmindp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v8f64_fast:
@@ -817,7 +806,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmindp vs0, vs1, vs0
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmin.v8f64(<8 x double> %a)
@@ -894,7 +882,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmindp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v16f64_fast:
@@ -908,7 +895,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmindp vs0, vs0, vs2
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v16f64_fast:
@@ -923,7 +909,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmindp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v16f64_fast:
@@ -937,7 +922,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmindp vs0, vs0, vs2
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> %a)
@@ -1074,7 +1058,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmindp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v32f64_fast:
@@ -1100,7 +1083,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmindp vs0, vs0, vs2
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v32f64_fast:
@@ -1127,7 +1109,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmindp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v32f64_fast:
@@ -1153,7 +1134,6 @@ define dso_local double @v32f64_fast(<32 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmindp vs0, vs0, vs2
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmindp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmin.v32f64(<32 x double> %a)
diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
index e123f5c..b566bb9 100644
--- a/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fmul.ll
@@ -1081,14 +1081,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs0, v2
; PWR9LE-NEXT: xvmuldp vs0, v2, vs0
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v2f64_fast:
; PWR9BE: # %bb.0: # %entry
; PWR9BE-NEXT: xxswapd vs0, v2
; PWR9BE-NEXT: xvmuldp vs1, v2, vs0
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v2f64_fast:
@@ -1096,14 +1094,12 @@ define dso_local double @v2f64_fast(<2 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs0, v2
; PWR10LE-NEXT: xvmuldp vs0, v2, vs0
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v2f64_fast:
; PWR10BE: # %bb.0: # %entry
; PWR10BE-NEXT: xxswapd vs0, v2
; PWR10BE-NEXT: xvmuldp vs1, v2, vs0
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmul.v2f64(double 1.000000e+00, <2 x double> %a)
@@ -1203,7 +1199,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmuldp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v4f64_fast:
@@ -1211,7 +1206,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmuldp vs0, v2, v3
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmuldp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v4f64_fast:
@@ -1220,7 +1214,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmuldp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v4f64_fast:
@@ -1228,7 +1221,6 @@ define dso_local double @v4f64_fast(<4 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmuldp vs0, v2, v3
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmuldp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> %a)
@@ -1378,7 +1370,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmuldp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v8f64_fast:
@@ -1388,7 +1379,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmuldp vs0, vs1, vs0
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmuldp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v8f64_fast:
@@ -1399,7 +1389,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmuldp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v8f64_fast:
@@ -1409,7 +1398,6 @@ define dso_local double @v8f64_fast(<8 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmuldp vs0, vs1, vs0
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmuldp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %a)
@@ -1659,7 +1647,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9LE-NEXT: xxswapd vs1, vs0
; PWR9LE-NEXT: xvmuldp vs0, vs0, vs1
; PWR9LE-NEXT: xxswapd vs1, vs0
-; PWR9LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9LE-NEXT: blr
;
; PWR9BE-LABEL: v16f64_fast:
@@ -1673,7 +1660,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR9BE-NEXT: xvmuldp vs0, vs0, vs2
; PWR9BE-NEXT: xxswapd vs1, vs0
; PWR9BE-NEXT: xvmuldp vs1, vs0, vs1
-; PWR9BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR9BE-NEXT: blr
;
; PWR10LE-LABEL: v16f64_fast:
@@ -1688,7 +1674,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10LE-NEXT: xxswapd vs1, vs0
; PWR10LE-NEXT: xvmuldp vs0, vs0, vs1
; PWR10LE-NEXT: xxswapd vs1, vs0
-; PWR10LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10LE-NEXT: blr
;
; PWR10BE-LABEL: v16f64_fast:
@@ -1702,7 +1687,6 @@ define dso_local double @v16f64_fast(<16 x double> %a) local_unnamed_addr #0 {
; PWR10BE-NEXT: xvmuldp vs0, vs0, vs2
; PWR10BE-NEXT: xxswapd vs1, vs0
; PWR10BE-NEXT: xvmuldp vs1, vs0, vs1
-; PWR10BE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; PWR10BE-NEXT: blr
entry:
%0 = call fast double @llvm.vector.reduce.fmul.v16f64(double 1.000000e+00, <16 x double> %a)
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index 32cbfd6..d1d29a0 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -1993,7 +1993,6 @@ define double @test63(<2 x double> %a) {
; CHECK-LE-LABEL: test63:
; CHECK-LE: # %bb.0:
; CHECK-LE-NEXT: xxswapd vs1, v2
-; CHECK-LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-LE-NEXT: blr
%v = extractelement <2 x double> %a, i32 0
ret double %v
@@ -2006,13 +2005,11 @@ define double @test64(<2 x double> %a) {
; CHECK-LABEL: test64:
; CHECK: # %bb.0:
; CHECK-NEXT: xxswapd vs1, v2
-; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-NEXT: blr
;
; CHECK-REG-LABEL: test64:
; CHECK-REG: # %bb.0:
; CHECK-REG-NEXT: xxswapd vs1, v2
-; CHECK-REG-NEXT: # kill: def $f1 killed $f1 killed $vsl1
; CHECK-REG-NEXT: blr
;
; CHECK-FISL-LABEL: test64:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-load.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-load.mir
new file mode 100644
index 0000000..12f2188
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-load.mir
@@ -0,0 +1,1043 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+
+ define <vscale x 1 x i8> @vload_nxv1i8(ptr %pa) #0 {
+ %va = load <vscale x 1 x i8>, ptr %pa, align 1
+ ret <vscale x 1 x i8> %va
+ }
+
+ define <vscale x 2 x i8> @vload_nxv2i8(ptr %pa) #0 {
+ %va = load <vscale x 2 x i8>, ptr %pa, align 2
+ ret <vscale x 2 x i8> %va
+ }
+
+ define <vscale x 4 x i8> @vload_nxv4i8(ptr %pa) #0 {
+ %va = load <vscale x 4 x i8>, ptr %pa, align 4
+ ret <vscale x 4 x i8> %va
+ }
+
+ define <vscale x 8 x i8> @vload_nxv8i8(ptr %pa) #0 {
+ %va = load <vscale x 8 x i8>, ptr %pa, align 8
+ ret <vscale x 8 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nxv16i8(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 16
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 32 x i8> @vload_nxv32i8(ptr %pa) #0 {
+ %va = load <vscale x 32 x i8>, ptr %pa, align 32
+ ret <vscale x 32 x i8> %va
+ }
+
+ define <vscale x 64 x i8> @vload_nxv64i8(ptr %pa) #0 {
+ %va = load <vscale x 64 x i8>, ptr %pa, align 64
+ ret <vscale x 64 x i8> %va
+ }
+
+ define <vscale x 1 x i16> @vload_nxv1i16(ptr %pa) #0 {
+ %va = load <vscale x 1 x i16>, ptr %pa, align 2
+ ret <vscale x 1 x i16> %va
+ }
+
+ define <vscale x 2 x i16> @vload_nxv2i16(ptr %pa) #0 {
+ %va = load <vscale x 2 x i16>, ptr %pa, align 4
+ ret <vscale x 2 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nxv4i16(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 8
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 8 x i16> @vload_nxv8i16(ptr %pa) #0 {
+ %va = load <vscale x 8 x i16>, ptr %pa, align 16
+ ret <vscale x 8 x i16> %va
+ }
+
+ define <vscale x 16 x i16> @vload_nxv16i16(ptr %pa) #0 {
+ %va = load <vscale x 16 x i16>, ptr %pa, align 32
+ ret <vscale x 16 x i16> %va
+ }
+
+ define <vscale x 32 x i16> @vload_nxv32i16(ptr %pa) #0 {
+ %va = load <vscale x 32 x i16>, ptr %pa, align 64
+ ret <vscale x 32 x i16> %va
+ }
+
+ define <vscale x 1 x i32> @vload_nxv1i32(ptr %pa) #0 {
+ %va = load <vscale x 1 x i32>, ptr %pa, align 4
+ ret <vscale x 1 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nxv2i32(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 8
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 4 x i32> @vload_nxv4i32(ptr %pa) #0 {
+ %va = load <vscale x 4 x i32>, ptr %pa, align 16
+ ret <vscale x 4 x i32> %va
+ }
+
+ define <vscale x 8 x i32> @vload_nxv8i32(ptr %pa) #0 {
+ %va = load <vscale x 8 x i32>, ptr %pa, align 32
+ ret <vscale x 8 x i32> %va
+ }
+
+ define <vscale x 16 x i32> @vload_nxv16i32(ptr %pa) #0 {
+ %va = load <vscale x 16 x i32>, ptr %pa, align 64
+ ret <vscale x 16 x i32> %va
+ }
+
+ define <vscale x 1 x i64> @vload_nxv1i64(ptr %pa) #0 {
+ %va = load <vscale x 1 x i64>, ptr %pa, align 8
+ ret <vscale x 1 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nxv2i64(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 16
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 4 x i64> @vload_nxv4i64(ptr %pa) #0 {
+ %va = load <vscale x 4 x i64>, ptr %pa, align 32
+ ret <vscale x 4 x i64> %va
+ }
+
+ define <vscale x 8 x i64> @vload_nxv8i64(ptr %pa) #0 {
+ %va = load <vscale x 8 x i64>, ptr %pa, align 64
+ ret <vscale x 8 x i64> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nxv16i8_align1(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 1
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nxv16i8_align2(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 2
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nxv16i8_align16(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 16
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nxv16i8_align64(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 64
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nxv4i16_align1(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 1
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nxv4i16_align2(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 2
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nxv4i16_align4(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 4
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nxv4i16_align8(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 8
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nxv4i16_align16(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 16
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nxv2i32_align2(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 2
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nxv2i32_align4(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 4
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nxv2i32_align8(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 8
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nxv2i32_align16(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 16
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nxv2i32_align256(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 256
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nxv2i64_align4(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 4
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nxv2i64_align8(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 8
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nxv2i64_align16(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 16
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nxv2i64_align32(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 32
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 1 x ptr> @vload_nxv1ptr(ptr %pa) #0 {
+ %va = load <vscale x 1 x ptr>, ptr %pa, align 4
+ ret <vscale x 1 x ptr> %va
+ }
+
+ define <vscale x 2 x ptr> @vload_nxv2ptr(ptr %pa) #0 {
+ %va = load <vscale x 2 x ptr>, ptr %pa, align 8
+ ret <vscale x 2 x ptr> %va
+ }
+
+ define <vscale x 8 x ptr> @vload_nxv8ptr(ptr %pa) #0 {
+ %va = load <vscale x 8 x ptr>, ptr %pa, align 32
+ ret <vscale x 8 x ptr> %va
+ }
+
+ attributes #0 = { "target-features"="+v" }
+
+...
+---
+name: vload_nxv1i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv1i8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv4i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv8i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv8i8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv16i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv16i8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv32i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv32i8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nxv64i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv64i8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 64 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nxv1i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv1i16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv4i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv8i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv8i16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv16i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv16i16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+ ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nxv32i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv32i16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+ ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nxv1i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv1i32
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i32
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv4i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i32
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv8i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv8i32
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+ ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nxv16i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv16i32
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+ ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nxv1i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv1i64
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i64
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv4i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i64
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+ ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nxv8i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv8i64
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+ ; CHECK-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nxv16i8_align1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv16i8_align1
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv16i8_align2
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv16i8_align2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv16i8_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv16i8_align16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv16i8_align64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv16i8_align64
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv4i16_align1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i16_align1
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 1)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv4i16_align2
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i16_align2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv4i16_align4
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i16_align4
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv4i16_align8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i16_align8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv4i16_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv4i16_align16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i32_align2
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i32_align2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s32>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 2)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i32_align4
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i32_align4
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i32_align8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i32_align8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i32_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i32_align16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i32_align256
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i32_align256
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2i64_align4
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i64_align4
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s64>) = G_BITCAST [[LOAD]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: $v8m2 = COPY [[BITCAST]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 4)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv2i64_align8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i64_align8
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv2i64_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i64_align16
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv2i64_align32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2i64_align32
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+ ; CHECK-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nxv1ptr
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv1ptr
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x p0>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv2ptr
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv2ptr
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+ ; CHECK-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x p0>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nxv8ptr
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; CHECK-LABEL: name: vload_nxv8ptr
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+ ; CHECK-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 8 x p0>)
+ PseudoRET implicit $v8m4
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-store.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-store.mir
new file mode 100644
index 0000000..b91d255
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-store.mir
@@ -0,0 +1,1043 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+--- |
+
+ define void @vstore_nx1i8(ptr %pa, <vscale x 1 x i8> %b) #0 {
+ store <vscale x 1 x i8> %b, ptr %pa, align 1
+ ret void
+ }
+
+ define void @vstore_nx2i8(ptr %pa, <vscale x 2 x i8> %b) #0 {
+ store <vscale x 2 x i8> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx4i8(ptr %pa, <vscale x 4 x i8> %b) #0 {
+ store <vscale x 4 x i8> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx8i8(ptr %pa, <vscale x 8 x i8> %b) #0 {
+ store <vscale x 8 x i8> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx16i8(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx32i8(ptr %pa, <vscale x 32 x i8> %b) #0 {
+ store <vscale x 32 x i8> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx64i8(ptr %pa, <vscale x 64 x i8> %b) #0 {
+ store <vscale x 64 x i8> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx1i16(ptr %pa, <vscale x 1 x i16> %b) #0 {
+ store <vscale x 1 x i16> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx2i16(ptr %pa, <vscale x 2 x i16> %b) #0 {
+ store <vscale x 2 x i16> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx4i16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx8i16(ptr %pa, <vscale x 8 x i16> %b) #0 {
+ store <vscale x 8 x i16> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx16i16(ptr %pa, <vscale x 16 x i16> %b) #0 {
+ store <vscale x 16 x i16> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx32i16(ptr %pa, <vscale x 32 x i16> %b) #0 {
+ store <vscale x 32 x i16> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx1i32(ptr %pa, <vscale x 1 x i32> %b) #0 {
+ store <vscale x 1 x i32> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2i32(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx4i32(ptr %pa, <vscale x 4 x i32> %b) #0 {
+ store <vscale x 4 x i32> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx8i32(ptr %pa, <vscale x 8 x i32> %b) #0 {
+ store <vscale x 8 x i32> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx16i32(ptr %pa, <vscale x 16 x i32> %b) #0 {
+ store <vscale x 16 x i32> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx1i64(ptr %pa, <vscale x 1 x i64> %b) #0 {
+ store <vscale x 1 x i64> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx2i64(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx4i64(ptr %pa, <vscale x 4 x i64> %b) #0 {
+ store <vscale x 4 x i64> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx8i64(ptr %pa, <vscale x 8 x i64> %b) #0 {
+ store <vscale x 8 x i64> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx16i8_align1(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 1
+ ret void
+ }
+
+ define void @vstore_nx16i8_align2(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx16i8_align16(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx16i8_align64(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx4i16_align1(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 1
+ ret void
+ }
+
+ define void @vstore_nx4i16_align2(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx4i16_align4(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx4i16_align8(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx4i16_align16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx2i32_align2(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx2i32_align4(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2i32_align8(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx2i32_align16(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx2i32_align256(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 256
+ ret void
+ }
+
+ define void @vstore_nx2i64_align4(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2i64_align8(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx2i64_align16(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx2i64_align32(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx1ptr(ptr %pa, <vscale x 1 x ptr> %b) #0 {
+ store <vscale x 1 x ptr> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2ptr(ptr %pa, <vscale x 2 x ptr> %b) #0 {
+ store <vscale x 2 x ptr> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx8ptr(ptr %pa, <vscale x 8 x ptr> %b) #0 {
+ store <vscale x 8 x ptr> %b, ptr %pa, align 32
+ ret void
+ }
+
+ attributes #0 = { "target-features"="+v" }
+
+...
+---
+name: vstore_nx1i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx1i8
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s8>), [[COPY]](p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s8>), %0(p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i8
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s8>), [[COPY]](p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s8>), %0(p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx4i8
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s8>), [[COPY]](p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s8>), %0(p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx8i8
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx16i8
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx32i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; CHECK-LABEL: name: vstore_nx32i8
+ ; CHECK: liveins: $x10, $v8m4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 32 x s8>), [[COPY]](p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ G_STORE %1(<vscale x 32 x s8>), %0(p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx64i8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; CHECK-LABEL: name: vstore_nx64i8
+ ; CHECK: liveins: $x10, $v8m8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 64 x s8>) = COPY $v8m8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 64 x s8>), [[COPY]](p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 64 x s8>) = COPY $v8m8
+ G_STORE %1(<vscale x 64 x s8>), %0(p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx1i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx1i16
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s16>), [[COPY]](p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s16>), %0(p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i16
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s16>), [[COPY]](p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s16>), %0(p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx4i16
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx8i16
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s16>), [[COPY]](p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ G_STORE %1(<vscale x 8 x s16>), %0(p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; CHECK-LABEL: name: vstore_nx16i16
+ ; CHECK: liveins: $x10, $v8m4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s16>), [[COPY]](p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ G_STORE %1(<vscale x 16 x s16>), %0(p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx32i16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; CHECK-LABEL: name: vstore_nx32i16
+ ; CHECK: liveins: $x10, $v8m8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s16>) = COPY $v8m8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 32 x s16>), [[COPY]](p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s16>) = COPY $v8m8
+ G_STORE %1(<vscale x 32 x s16>), %0(p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx1i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx1i32
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s32>), [[COPY]](p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s32>), %0(p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i32
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx4i32
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s32>), [[COPY]](p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ G_STORE %1(<vscale x 4 x s32>), %0(p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; CHECK-LABEL: name: vstore_nx8i32
+ ; CHECK: liveins: $x10, $v8m4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s32>), [[COPY]](p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ G_STORE %1(<vscale x 8 x s32>), %0(p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; CHECK-LABEL: name: vstore_nx16i32
+ ; CHECK: liveins: $x10, $v8m8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s32>) = COPY $v8m8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s32>), [[COPY]](p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s32>) = COPY $v8m8
+ G_STORE %1(<vscale x 16 x s32>), %0(p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx1i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx1i64
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s64>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x s64>), [[COPY]](p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s64>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s64>), %0(p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx2i64
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; CHECK-LABEL: name: vstore_nx4i64
+ ; CHECK: liveins: $x10, $v8m4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v8m4
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s64>), [[COPY]](p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s64>) = COPY $v8m4
+ G_STORE %1(<vscale x 4 x s64>), %0(p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; CHECK-LABEL: name: vstore_nx8i64
+ ; CHECK: liveins: $x10, $v8m8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s64>) = COPY $v8m8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x s64>), [[COPY]](p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s64>) = COPY $v8m8
+ G_STORE %1(<vscale x 8 x s64>), %0(p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx16i8_align1
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align2
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx16i8_align2
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx16i8_align16
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align64
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx16i8_align64
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align1
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx4i16_align1
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 1)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align2
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx4i16_align2
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align4
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx4i16_align4
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx4i16_align8
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx4i16_align16
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align2
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i32_align2
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 2)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align4
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i32_align4
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i32_align8
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i32_align16
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align256
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx2i32_align256
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align4
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx2i64_align4
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 16 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 4)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align8
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx2i64_align8
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align16
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx2i64_align16
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align32
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx2i64_align32
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+ PseudoRET
+
+...
+---
+name: vstore_nx1ptr
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; CHECK-LABEL: name: vstore_nx1ptr
+ ; CHECK: liveins: $v8, $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x p0>) = COPY $v8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 1 x p0>), [[COPY]](p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x p0>) = COPY $v8
+ G_STORE %1(<vscale x 1 x p0>), %0(p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2ptr
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; CHECK-LABEL: name: vstore_nx2ptr
+ ; CHECK: liveins: $x10, $v8m2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x p0>) = COPY $v8m2
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 2 x p0>), [[COPY]](p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x p0>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x p0>), %0(p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8ptr
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; CHECK-LABEL: name: vstore_nx8ptr
+ ; CHECK: liveins: $x10, $v8m8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x p0>) = COPY $v8m8
+ ; CHECK-NEXT: G_STORE [[COPY1]](<vscale x 8 x p0>), [[COPY]](p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+ ; CHECK-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x p0>) = COPY $v8m8
+ G_STORE %1(<vscale x 8 x p0>), %0(p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+ PseudoRET
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/load.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/load.mir
new file mode 100644
index 0000000..5c02c72
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/load.mir
@@ -0,0 +1,1481 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+--- |
+
+ define <vscale x 1 x i8> @vload_nx1i8(ptr %pa) #0 {
+ %va = load <vscale x 1 x i8>, ptr %pa, align 1
+ ret <vscale x 1 x i8> %va
+ }
+
+ define <vscale x 2 x i8> @vload_nx2i8(ptr %pa) #0 {
+ %va = load <vscale x 2 x i8>, ptr %pa, align 2
+ ret <vscale x 2 x i8> %va
+ }
+
+ define <vscale x 4 x i8> @vload_nx4i8(ptr %pa) #0 {
+ %va = load <vscale x 4 x i8>, ptr %pa, align 4
+ ret <vscale x 4 x i8> %va
+ }
+
+ define <vscale x 8 x i8> @vload_nx8i8(ptr %pa) #0 {
+ %va = load <vscale x 8 x i8>, ptr %pa, align 8
+ ret <vscale x 8 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nx16i8(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 16
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 32 x i8> @vload_nx32i8(ptr %pa) #0 {
+ %va = load <vscale x 32 x i8>, ptr %pa, align 32
+ ret <vscale x 32 x i8> %va
+ }
+
+ define <vscale x 64 x i8> @vload_nx64i8(ptr %pa) #0 {
+ %va = load <vscale x 64 x i8>, ptr %pa, align 64
+ ret <vscale x 64 x i8> %va
+ }
+
+ define <vscale x 1 x i16> @vload_nx1i16(ptr %pa) #0 {
+ %va = load <vscale x 1 x i16>, ptr %pa, align 2
+ ret <vscale x 1 x i16> %va
+ }
+
+ define <vscale x 2 x i16> @vload_nx2i16(ptr %pa) #0 {
+ %va = load <vscale x 2 x i16>, ptr %pa, align 4
+ ret <vscale x 2 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nx4i16(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 8
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 8 x i16> @vload_nx8i16(ptr %pa) #0 {
+ %va = load <vscale x 8 x i16>, ptr %pa, align 16
+ ret <vscale x 8 x i16> %va
+ }
+
+ define <vscale x 16 x i16> @vload_nx16i16(ptr %pa) #0 {
+ %va = load <vscale x 16 x i16>, ptr %pa, align 32
+ ret <vscale x 16 x i16> %va
+ }
+
+ define <vscale x 32 x i16> @vload_nx32i16(ptr %pa) #0 {
+ %va = load <vscale x 32 x i16>, ptr %pa, align 64
+ ret <vscale x 32 x i16> %va
+ }
+
+ define <vscale x 1 x i32> @vload_nx1i32(ptr %pa) #0 {
+ %va = load <vscale x 1 x i32>, ptr %pa, align 4
+ ret <vscale x 1 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nx2i32(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 8
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 4 x i32> @vload_nx4i32(ptr %pa) #0 {
+ %va = load <vscale x 4 x i32>, ptr %pa, align 16
+ ret <vscale x 4 x i32> %va
+ }
+
+ define <vscale x 8 x i32> @vload_nx8i32(ptr %pa) #0 {
+ %va = load <vscale x 8 x i32>, ptr %pa, align 32
+ ret <vscale x 8 x i32> %va
+ }
+
+ define <vscale x 16 x i32> @vload_nx16i32(ptr %pa) #0 {
+ %va = load <vscale x 16 x i32>, ptr %pa, align 64
+ ret <vscale x 16 x i32> %va
+ }
+
+ define <vscale x 1 x i64> @vload_nx1i64(ptr %pa) #0 {
+ %va = load <vscale x 1 x i64>, ptr %pa, align 8
+ ret <vscale x 1 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nx2i64(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 16
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 4 x i64> @vload_nx4i64(ptr %pa) #0 {
+ %va = load <vscale x 4 x i64>, ptr %pa, align 32
+ ret <vscale x 4 x i64> %va
+ }
+
+ define <vscale x 8 x i64> @vload_nx8i64(ptr %pa) #0 {
+ %va = load <vscale x 8 x i64>, ptr %pa, align 64
+ ret <vscale x 8 x i64> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nx16i8_align1(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 1
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nx16i8_align2(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 2
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nx16i8_align16(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 16
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 16 x i8> @vload_nx16i8_align64(ptr %pa) #0 {
+ %va = load <vscale x 16 x i8>, ptr %pa, align 64
+ ret <vscale x 16 x i8> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nx4i16_align1(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 1
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nx4i16_align2(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 2
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nx4i16_align4(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 4
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nx4i16_align8(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 8
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 4 x i16> @vload_nx4i16_align16(ptr %pa) #0 {
+ %va = load <vscale x 4 x i16>, ptr %pa, align 16
+ ret <vscale x 4 x i16> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nx2i32_align2(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 2
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nx2i32_align4(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 4
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nx2i32_align8(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 8
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nx2i32_align16(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 16
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i32> @vload_nx2i32_align256(ptr %pa) #0 {
+ %va = load <vscale x 2 x i32>, ptr %pa, align 256
+ ret <vscale x 2 x i32> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nx2i64_align4(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 4
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nx2i64_align8(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 8
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nx2i64_align16(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 16
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 2 x i64> @vload_nx2i64_align32(ptr %pa) #0 {
+ %va = load <vscale x 2 x i64>, ptr %pa, align 32
+ ret <vscale x 2 x i64> %va
+ }
+
+ define <vscale x 1 x ptr> @vload_nx1ptr(ptr %pa) #0 {
+ %va = load <vscale x 1 x ptr>, ptr %pa, align 4
+ ret <vscale x 1 x ptr> %va
+ }
+
+ define <vscale x 2 x ptr> @vload_nx2ptr(ptr %pa) #0 {
+ %va = load <vscale x 2 x ptr>, ptr %pa, align 8
+ ret <vscale x 2 x ptr> %va
+ }
+
+ define <vscale x 8 x ptr> @vload_nx8ptr(ptr %pa) #0 {
+ %va = load <vscale x 8 x ptr>, ptr %pa, align 32
+ ret <vscale x 8 x ptr> %va
+ }
+
+...
+---
+name: vload_nx1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx1i8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx1i8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx4i8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx8i8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx8i8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx16i8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx16i8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx32i8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: vload_nx32i8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nx64i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx64i8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: vload_nx64i8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 64 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nx1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx1i16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx1i16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx4i16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx8i16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx8i16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx16i16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+ ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: vload_nx16i16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+ ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nx32i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx32i16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+ ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: vload_nx32i16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+ ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nx1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx1i32
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx1i32
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i32
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i32
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i32
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx4i32
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx8i32
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+ ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: vload_nx8i32
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+ ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nx16i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx16i32
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+ ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: vload_nx16i32
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+ ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nx1i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx1i64
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx1i64
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i64
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx2i64
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx4i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i64
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+ ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: vload_nx4i64
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+ ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: vload_nx8i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx8i64
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+ ; RV32I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: vload_nx8i64
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+ ; RV64I-NEXT: $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: vload_nx16i8_align1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx16i8_align1
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx16i8_align1
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx16i8_align2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx16i8_align2
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx16i8_align2
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx16i8_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx16i8_align16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx16i8_align16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx16i8_align64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx16i8_align64
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx16i8_align64
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+ $v8m2 = COPY %1(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx4i16_align1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i16_align1
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+ ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx4i16_align1
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+ ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %2:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 1)
+ %1:_(<vscale x 4 x s16>) = G_BITCAST %2(<vscale x 8 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx4i16_align2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i16_align2
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx4i16_align2
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx4i16_align4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i16_align4
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx4i16_align4
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx4i16_align8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i16_align8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx4i16_align8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx4i16_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx4i16_align16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx4i16_align16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = G_LOAD %0(p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i32_align2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i32_align2
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+ ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i32_align2
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+ ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_BITCAST [[LOAD]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %2:_(<vscale x 8 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 8 x s8>) from %ir.pa, align 2)
+ %1:_(<vscale x 2 x s32>) = G_BITCAST %2(<vscale x 8 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i32_align4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i32_align4
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i32_align4
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i32_align8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i32_align8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i32_align8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i32_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i32_align16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i32_align16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i32_align256
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i32_align256
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2i32_align256
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2i64_align4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i64_align4
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+ ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_BITCAST [[LOAD]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[BITCAST]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx2i64_align4
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+ ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_BITCAST [[LOAD]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[BITCAST]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %2:_(<vscale x 16 x s8>) = G_LOAD %0(p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 4)
+ %1:_(<vscale x 2 x s64>) = G_BITCAST %2(<vscale x 16 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx2i64_align8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i64_align8
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx2i64_align8
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx2i64_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i64_align16
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx2i64_align16
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx2i64_align32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2i64_align32
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+ ; RV32I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: vload_nx2i64_align32
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+ ; RV64I-NEXT: $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = G_LOAD %0(p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: vload_nx1ptr
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx1ptr
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx1ptr
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 1 x p0>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx2ptr
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx2ptr
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+ ; RV32I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: vload_nx2ptr
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+ ; RV64I-NEXT: $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+ $v8 = COPY %1(<vscale x 2 x p0>)
+ PseudoRET implicit $v8
+
+...
+---
+name: vload_nx8ptr
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10
+
+ ; RV32I-LABEL: name: vload_nx8ptr
+ ; RV32I: liveins: $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+ ; RV32I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: vload_nx8ptr
+ ; RV64I: liveins: $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[LOAD:%[0-9]+]]:vrb(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+ ; RV64I-NEXT: $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x p0>) = G_LOAD %0(p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+ $v8m4 = COPY %1(<vscale x 8 x p0>)
+ PseudoRET implicit $v8m4
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/store.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/store.mir
new file mode 100644
index 0000000..0bcef4e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/store.mir
@@ -0,0 +1,1481 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+--- |
+
+ define void @vstore_nx1i8(ptr %pa, <vscale x 1 x i8> %b) #0 {
+ store <vscale x 1 x i8> %b, ptr %pa, align 1
+ ret void
+ }
+
+ define void @vstore_nx2i8(ptr %pa, <vscale x 2 x i8> %b) #0 {
+ store <vscale x 2 x i8> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx4i8(ptr %pa, <vscale x 4 x i8> %b) #0 {
+ store <vscale x 4 x i8> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx8i8(ptr %pa, <vscale x 8 x i8> %b) #0 {
+ store <vscale x 8 x i8> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx16i8(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx32i8(ptr %pa, <vscale x 32 x i8> %b) #0 {
+ store <vscale x 32 x i8> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx64i8(ptr %pa, <vscale x 64 x i8> %b) #0 {
+ store <vscale x 64 x i8> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx1i16(ptr %pa, <vscale x 1 x i16> %b) #0 {
+ store <vscale x 1 x i16> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx2i16(ptr %pa, <vscale x 2 x i16> %b) #0 {
+ store <vscale x 2 x i16> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx4i16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx8i16(ptr %pa, <vscale x 8 x i16> %b) #0 {
+ store <vscale x 8 x i16> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx16i16(ptr %pa, <vscale x 16 x i16> %b) #0 {
+ store <vscale x 16 x i16> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx32i16(ptr %pa, <vscale x 32 x i16> %b) #0 {
+ store <vscale x 32 x i16> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx1i32(ptr %pa, <vscale x 1 x i32> %b) #0 {
+ store <vscale x 1 x i32> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2i32(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx4i32(ptr %pa, <vscale x 4 x i32> %b) #0 {
+ store <vscale x 4 x i32> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx8i32(ptr %pa, <vscale x 8 x i32> %b) #0 {
+ store <vscale x 8 x i32> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx16i32(ptr %pa, <vscale x 16 x i32> %b) #0 {
+ store <vscale x 16 x i32> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx1i64(ptr %pa, <vscale x 1 x i64> %b) #0 {
+ store <vscale x 1 x i64> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx2i64(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx4i64(ptr %pa, <vscale x 4 x i64> %b) #0 {
+ store <vscale x 4 x i64> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx8i64(ptr %pa, <vscale x 8 x i64> %b) #0 {
+ store <vscale x 8 x i64> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx16i8_align1(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 1
+ ret void
+ }
+
+ define void @vstore_nx16i8_align2(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx16i8_align16(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx16i8_align64(ptr %pa, <vscale x 16 x i8> %b) #0 {
+ store <vscale x 16 x i8> %b, ptr %pa, align 64
+ ret void
+ }
+
+ define void @vstore_nx4i16_align1(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 1
+ ret void
+ }
+
+ define void @vstore_nx4i16_align2(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx4i16_align4(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx4i16_align8(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx4i16_align16(ptr %pa, <vscale x 4 x i16> %b) #0 {
+ store <vscale x 4 x i16> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx2i32_align2(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 2
+ ret void
+ }
+
+ define void @vstore_nx2i32_align4(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2i32_align8(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx2i32_align16(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx2i32_align256(ptr %pa, <vscale x 2 x i32> %b) #0 {
+ store <vscale x 2 x i32> %b, ptr %pa, align 256
+ ret void
+ }
+
+ define void @vstore_nx2i64_align4(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2i64_align8(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx2i64_align16(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 16
+ ret void
+ }
+
+ define void @vstore_nx2i64_align32(ptr %pa, <vscale x 2 x i64> %b) #0 {
+ store <vscale x 2 x i64> %b, ptr %pa, align 32
+ ret void
+ }
+
+ define void @vstore_nx1ptr(ptr %pa, <vscale x 1 x ptr> %b) #0 {
+ store <vscale x 1 x ptr> %b, ptr %pa, align 4
+ ret void
+ }
+
+ define void @vstore_nx2ptr(ptr %pa, <vscale x 2 x ptr> %b) #0 {
+ store <vscale x 2 x ptr> %b, ptr %pa, align 8
+ ret void
+ }
+
+ define void @vstore_nx8ptr(ptr %pa, <vscale x 8 x ptr> %b) #0 {
+ store <vscale x 8 x ptr> %b, ptr %pa, align 32
+ ret void
+ }
+
+...
+---
+name: vstore_nx1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx1i8
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s8>), [[COPY]](p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx1i8
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s8>), [[COPY]](p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s8>), %0(p0) :: (store (<vscale x 1 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i8
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s8>), [[COPY]](p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i8
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s8>), [[COPY]](p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s8>), %0(p0) :: (store (<vscale x 2 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx4i8
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s8>), [[COPY]](p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i8
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s8>), [[COPY]](p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s8>), %0(p0) :: (store (<vscale x 4 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx8i8
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx8i8
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ G_STORE %1(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx16i8
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx16i8
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; RV32I-LABEL: name: vstore_nx32i8
+ ; RV32I: liveins: $x10, $v8m4
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s8>), [[COPY]](p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx32i8
+ ; RV64I: liveins: $x10, $v8m4
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s8>), [[COPY]](p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ G_STORE %1(<vscale x 32 x s8>), %0(p0) :: (store (<vscale x 32 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx64i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; RV32I-LABEL: name: vstore_nx64i8
+ ; RV32I: liveins: $x10, $v8m8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 64 x s8>) = COPY $v8m8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 64 x s8>), [[COPY]](p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx64i8
+ ; RV64I: liveins: $x10, $v8m8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 64 x s8>) = COPY $v8m8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 64 x s8>), [[COPY]](p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 64 x s8>) = COPY $v8m8
+ G_STORE %1(<vscale x 64 x s8>), %0(p0) :: (store (<vscale x 64 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx1i16
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s16>), [[COPY]](p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx1i16
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s16>), [[COPY]](p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s16>), %0(p0) :: (store (<vscale x 1 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i16
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s16>), [[COPY]](p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i16
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s16>), [[COPY]](p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s16>), %0(p0) :: (store (<vscale x 2 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx4i16
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i16
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx8i16
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s16>), [[COPY]](p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx8i16
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s16>), [[COPY]](p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ G_STORE %1(<vscale x 8 x s16>), %0(p0) :: (store (<vscale x 8 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; RV32I-LABEL: name: vstore_nx16i16
+ ; RV32I: liveins: $x10, $v8m4
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s16>), [[COPY]](p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx16i16
+ ; RV64I: liveins: $x10, $v8m4
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s16>), [[COPY]](p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ G_STORE %1(<vscale x 16 x s16>), %0(p0) :: (store (<vscale x 16 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx32i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; RV32I-LABEL: name: vstore_nx32i16
+ ; RV32I: liveins: $x10, $v8m8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s16>) = COPY $v8m8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s16>), [[COPY]](p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx32i16
+ ; RV64I: liveins: $x10, $v8m8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 32 x s16>) = COPY $v8m8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 32 x s16>), [[COPY]](p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 32 x s16>) = COPY $v8m8
+ G_STORE %1(<vscale x 32 x s16>), %0(p0) :: (store (<vscale x 32 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx1i32
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s32>), [[COPY]](p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx1i32
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s32>), [[COPY]](p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s32>), %0(p0) :: (store (<vscale x 1 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i32
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i32
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx4i32
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s32>), [[COPY]](p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i32
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s32>), [[COPY]](p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ G_STORE %1(<vscale x 4 x s32>), %0(p0) :: (store (<vscale x 4 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; RV32I-LABEL: name: vstore_nx8i32
+ ; RV32I: liveins: $x10, $v8m4
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s32>), [[COPY]](p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx8i32
+ ; RV64I: liveins: $x10, $v8m4
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s32>), [[COPY]](p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ G_STORE %1(<vscale x 8 x s32>), %0(p0) :: (store (<vscale x 8 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; RV32I-LABEL: name: vstore_nx16i32
+ ; RV32I: liveins: $x10, $v8m8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s32>) = COPY $v8m8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s32>), [[COPY]](p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx16i32
+ ; RV64I: liveins: $x10, $v8m8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s32>) = COPY $v8m8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s32>), [[COPY]](p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s32>) = COPY $v8m8
+ G_STORE %1(<vscale x 16 x s32>), %0(p0) :: (store (<vscale x 16 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx1i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx1i64
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s64>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s64>), [[COPY]](p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx1i64
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x s64>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x s64>), [[COPY]](p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x s64>) = COPY $v8
+ G_STORE %1(<vscale x 1 x s64>), %0(p0) :: (store (<vscale x 1 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx2i64
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i64
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; RV32I-LABEL: name: vstore_nx4i64
+ ; RV32I: liveins: $x10, $v8m4
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s64>) = COPY $v8m4
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s64>), [[COPY]](p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i64
+ ; RV64I: liveins: $x10, $v8m4
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s64>) = COPY $v8m4
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s64>), [[COPY]](p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s64>) = COPY $v8m4
+ G_STORE %1(<vscale x 4 x s64>), %0(p0) :: (store (<vscale x 4 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m8
+
+ ; RV32I-LABEL: name: vstore_nx8i64
+ ; RV32I: liveins: $x10, $v8m8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s64>) = COPY $v8m8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s64>), [[COPY]](p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx8i64
+ ; RV64I: liveins: $x10, $v8m8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x s64>) = COPY $v8m8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x s64>), [[COPY]](p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x s64>) = COPY $v8m8
+ G_STORE %1(<vscale x 8 x s64>), %0(p0) :: (store (<vscale x 8 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx16i8_align1
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx16i8_align1
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 1)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx16i8_align2
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx16i8_align2
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 2)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx16i8_align16
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx16i8_align16
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx16i8_align64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx16i8_align64
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx16i8_align64
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ G_STORE %1(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 64)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx4i16_align1
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i16_align1
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %2:_(<vscale x 8 x s8>) = G_BITCAST %1(<vscale x 4 x s16>)
+ G_STORE %2(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 1)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx4i16_align2
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i16_align2
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 2)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx4i16_align4
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i16_align4
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 4)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx4i16_align8
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i16_align8
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx4i16_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx4i16_align16
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx4i16_align16
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 4 x s16>), [[COPY]](p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ G_STORE %1(<vscale x 4 x s16>), %0(p0) :: (store (<vscale x 4 x s16>) into %ir.pa, align 16)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i32_align2
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i32_align2
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: G_STORE [[BITCAST]](<vscale x 8 x s8>), [[COPY]](p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ %2:_(<vscale x 8 x s8>) = G_BITCAST %1(<vscale x 2 x s32>)
+ G_STORE %2(<vscale x 8 x s8>), %0(p0) :: (store (<vscale x 8 x s8>) into %ir.pa, align 2)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i32_align4
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i32_align4
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 4)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i32_align8
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i32_align8
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i32_align16
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i32_align16
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 16)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i32_align256
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2i32_align256
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i32_align256
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s32>), [[COPY]](p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ G_STORE %1(<vscale x 2 x s32>), %0(p0) :: (store (<vscale x 2 x s32>) into %ir.pa, align 256)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align4
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx2i64_align4
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV32I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: G_STORE [[BITCAST]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i64_align4
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV64I-NEXT: [[BITCAST:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_BITCAST [[COPY1]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: G_STORE [[BITCAST]](<vscale x 16 x s8>), [[COPY]](p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ %2:_(<vscale x 16 x s8>) = G_BITCAST %1(<vscale x 2 x s64>)
+ G_STORE %2(<vscale x 16 x s8>), %0(p0) :: (store (<vscale x 16 x s8>) into %ir.pa, align 4)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx2i64_align8
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i64_align8
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 8)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx2i64_align16
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i64_align16
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2i64_align32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m2
+
+ ; RV32I-LABEL: name: vstore_nx2i64_align32
+ ; RV32I: liveins: $x10, $v8m2
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2i64_align32
+ ; RV64I: liveins: $x10, $v8m2
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x s64>) = COPY $v8m2
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x s64>), [[COPY]](p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x s64>) = COPY $v8m2
+ G_STORE %1(<vscale x 2 x s64>), %0(p0) :: (store (<vscale x 2 x s64>) into %ir.pa, align 32)
+ PseudoRET
+
+...
+---
+name: vstore_nx1ptr
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx1ptr
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x p0>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 1 x p0>), [[COPY]](p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx1ptr
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 1 x p0>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 1 x p0>), [[COPY]](p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 1 x p0>) = COPY $v8
+ G_STORE %1(<vscale x 1 x p0>), %0(p0) :: (store (<vscale x 1 x p0>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx2ptr
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $v8, $x10
+
+ ; RV32I-LABEL: name: vstore_nx2ptr
+ ; RV32I: liveins: $v8, $x10
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x p0>) = COPY $v8
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 2 x p0>), [[COPY]](p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx2ptr
+ ; RV64I: liveins: $v8, $x10
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 2 x p0>) = COPY $v8
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 2 x p0>), [[COPY]](p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 2 x p0>) = COPY $v8
+ G_STORE %1(<vscale x 2 x p0>), %0(p0) :: (store (<vscale x 2 x p0>) into %ir.pa)
+ PseudoRET
+
+...
+---
+name: vstore_nx8ptr
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.1 (%ir-block.0):
+ liveins: $x10, $v8m4
+
+ ; RV32I-LABEL: name: vstore_nx8ptr
+ ; RV32I: liveins: $x10, $v8m4
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x p0>) = COPY $v8m4
+ ; RV32I-NEXT: G_STORE [[COPY1]](<vscale x 8 x p0>), [[COPY]](p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+ ; RV32I-NEXT: PseudoRET
+ ;
+ ; RV64I-LABEL: name: vstore_nx8ptr
+ ; RV64I: liveins: $x10, $v8m4
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:gprb(p0) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:vrb(<vscale x 8 x p0>) = COPY $v8m4
+ ; RV64I-NEXT: G_STORE [[COPY1]](<vscale x 8 x p0>), [[COPY]](p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+ ; RV64I-NEXT: PseudoRET
+ %0:_(p0) = COPY $x10
+ %1:_(<vscale x 8 x p0>) = COPY $v8m4
+ G_STORE %1(<vscale x 8 x p0>), %0(p0) :: (store (<vscale x 8 x p0>) into %ir.pa)
+ PseudoRET
+
+...
diff --git a/llvm/test/CodeGen/RISCV/avgflooru.ll b/llvm/test/CodeGen/RISCV/avgflooru.ll
index b58aaab..fa88c37 100644
--- a/llvm/test/CodeGen/RISCV/avgflooru.ll
+++ b/llvm/test/CodeGen/RISCV/avgflooru.ll
@@ -164,18 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
; RV32I-LABEL: test_fixed_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: and a4, a1, a3
-; RV32I-NEXT: xor a1, a1, a3
-; RV32I-NEXT: srli a3, a1, 1
-; RV32I-NEXT: add a3, a4, a3
-; RV32I-NEXT: slli a1, a1, 31
-; RV32I-NEXT: xor a4, a0, a2
-; RV32I-NEXT: srli a4, a4, 1
-; RV32I-NEXT: or a1, a4, a1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: add a0, a2, a1
+; RV32I-NEXT: add a4, a3, a1
+; RV32I-NEXT: add a0, a2, a0
; RV32I-NEXT: sltu a1, a0, a2
-; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: add a2, a4, a1
+; RV32I-NEXT: beq a2, a3, .LBB6_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a1, a2, a3
+; RV32I-NEXT: .LBB6_2:
+; RV32I-NEXT: slli a1, a1, 31
+; RV32I-NEXT: srli a3, a2, 1
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: slli a2, a2, 31
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: test_fixed_i64:
@@ -195,18 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
; RV32I-LABEL: test_ext_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: and a4, a1, a3
-; RV32I-NEXT: xor a1, a1, a3
-; RV32I-NEXT: srli a3, a1, 1
-; RV32I-NEXT: add a3, a4, a3
-; RV32I-NEXT: slli a1, a1, 31
-; RV32I-NEXT: xor a4, a0, a2
-; RV32I-NEXT: srli a4, a4, 1
-; RV32I-NEXT: or a1, a4, a1
-; RV32I-NEXT: and a2, a0, a2
-; RV32I-NEXT: add a0, a2, a1
+; RV32I-NEXT: add a4, a3, a1
+; RV32I-NEXT: add a0, a2, a0
; RV32I-NEXT: sltu a1, a0, a2
-; RV32I-NEXT: add a1, a3, a1
+; RV32I-NEXT: add a2, a4, a1
+; RV32I-NEXT: beq a2, a3, .LBB7_2
+; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: sltu a1, a2, a3
+; RV32I-NEXT: .LBB7_2:
+; RV32I-NEXT: slli a1, a1, 31
+; RV32I-NEXT: srli a3, a2, 1
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: slli a2, a2, 31
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: or a0, a0, a2
; RV32I-NEXT: ret
;
; RV64I-LABEL: test_ext_i64:
diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll
index 101cb5a..622365c 100644
--- a/llvm/test/CodeGen/RISCV/condops.ll
+++ b/llvm/test/CodeGen/RISCV/condops.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV32I
; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs < %s | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV32XVENTANACONDOPS
; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xventanacondops < %s | FileCheck %s -check-prefix=RV64XVENTANACONDOPS
; RUN: llc -mtriple=riscv64 -target-abi=lp64f -mattr=+f,+zbs,+xtheadcondmov < %s | FileCheck %s -check-prefix=RV64XTHEADCONDMOV
; RUN: llc -mtriple=riscv32 -target-abi=ilp32f -mattr=+f,+zbs,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND
@@ -20,6 +21,12 @@ define i64 @zero1(i64 %rs1, i1 zeroext %rc) {
; RV64I-NEXT: and a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a0, a1
@@ -58,6 +65,12 @@ define i64 @zero2(i64 %rs1, i1 zeroext %rc) {
; RV64I-NEXT: and a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, a1
@@ -98,6 +111,13 @@ define i64 @zero_singlebit1(i64 %rs1, i64 %rs2) {
; RV64I-NEXT: and a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero_singlebit1:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: bexti a2, a2, 12
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero_singlebit1:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: bexti a1, a1, 12
@@ -145,6 +165,13 @@ define i64 @zero_singlebit2(i64 %rs1, i64 %rs2) {
; RV64I-NEXT: and a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero_singlebit2:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: bexti a2, a2, 12
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero_singlebit2:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: bexti a1, a1, 12
@@ -195,6 +222,16 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: add1:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT: add a2, a2, a4
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT: add a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: add1:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a0
@@ -246,6 +283,16 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: add a0, a2, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: add2:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT: add a2, a4, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT: add a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a3
+; RV32XVENTANACONDOPS-NEXT: add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: add2:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a0
@@ -297,6 +344,16 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: add a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: add3:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT: add a2, a2, a4
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT: add a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: add3:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a0
@@ -348,6 +405,16 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: add a0, a2, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: add4:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT: add a2, a4, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT: add a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a0, a3
+; RV32XVENTANACONDOPS-NEXT: add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: add4:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a1, a0
@@ -400,6 +467,17 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: sub a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: sub1:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT: sltu a5, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT: sub a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: mv a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: sub1:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a0
@@ -453,6 +531,17 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: sub a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: sub2:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT: sltu a5, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT: sub a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: mv a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: sub2:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a0
@@ -503,6 +592,15 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: or1:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT: or a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: or1:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a0
@@ -551,6 +649,15 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: or2:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: or2:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a0
@@ -599,6 +706,15 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: or a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: or3:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT: or a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: or3:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a0
@@ -647,6 +763,15 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: or a0, a2, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: or4:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: or4:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a1, a0
@@ -695,6 +820,15 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: xor a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: xor1:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT: xor a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT: xor a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: xor1:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a0
@@ -743,6 +877,15 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: xor a0, a2, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: xor2:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: xor a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT: xor a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: xor2:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a0
@@ -791,6 +934,15 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: xor a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: xor3:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT: xor a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT: xor a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: xor3:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a0
@@ -839,6 +991,15 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: xor a0, a2, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: xor4:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: xor a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT: xor a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: xor4:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a1, a0
@@ -891,6 +1052,17 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: and1:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: and a4, a2, a4
+; RV32XVENTANACONDOPS-NEXT: and a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: and1:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: and a2, a1, a2
@@ -948,6 +1120,17 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: and2:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT: and a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a3, a0
+; RV32XVENTANACONDOPS-NEXT: or a2, a1, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: and2:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: and a1, a1, a2
@@ -1005,6 +1188,17 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: and3:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: and a4, a2, a4
+; RV32XVENTANACONDOPS-NEXT: and a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: or a3, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a2, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a4, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: and3:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: and a2, a1, a2
@@ -1062,6 +1256,17 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: and4:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT: and a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a3, a0
+; RV32XVENTANACONDOPS-NEXT: or a2, a1, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: and4:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: and a1, a1, a2
@@ -1119,6 +1324,17 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: basic:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: or a3, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: mv a0, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: basic:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0
@@ -1177,6 +1393,19 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: seteq:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: seteq:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xor a0, a0, a1
@@ -1241,6 +1470,19 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setne:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setne:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xor a0, a0, a1
@@ -1309,6 +1551,22 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setgt:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: slt a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setgt:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: slt a0, a1, a0
@@ -1380,6 +1638,22 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setge:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: slt a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setge:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: slt a0, a0, a1
@@ -1451,6 +1725,22 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setlt:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: slt a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setlt:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: slt a0, a0, a1
@@ -1522,6 +1812,22 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setle:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: slt a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setle:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: slt a0, a1, a0
@@ -1593,6 +1899,22 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setugt:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setugt:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: sltu a0, a1, a0
@@ -1664,6 +1986,22 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setuge:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setuge:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: sltu a0, a0, a1
@@ -1735,6 +2073,22 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setult:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setult:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: sltu a0, a0, a1
@@ -1806,6 +2160,22 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setule:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor t0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: sltu a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, t0
+; RV32XVENTANACONDOPS-NEXT: sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, t0
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a6, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a4, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a7, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setule:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: sltu a0, a1, a0
@@ -1871,6 +2241,17 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: seteq_zero:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: seteq_zero:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0
@@ -1928,6 +2309,17 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setne_zero:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setne_zero:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a0
@@ -1987,6 +2379,18 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: seteq_constant:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, 123
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: seteq_constant:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi a0, a0, -123
@@ -2050,6 +2454,18 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setne_constant:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, 456
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setne_constant:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi a0, a0, -456
@@ -2113,6 +2529,18 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: seteq_2048:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: binvi a0, a0, 11
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: seteq_2048:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi a0, a0, -2048
@@ -2177,6 +2605,19 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: seteq_neg2048:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: seteq_neg2048:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xori a0, a0, -2048
@@ -2242,6 +2683,19 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setne_neg2048:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a1
+; RV32XVENTANACONDOPS-NEXT: or a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a2, a5, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setne_neg2048:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xori a0, a0, -2048
@@ -2302,6 +2756,15 @@ define i64 @zero1_seteq(i64 %a, i64 %b, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_seteq:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xor a0, a0, a1
@@ -2354,6 +2817,15 @@ define i64 @zero2_seteq(i64 %a, i64 %b, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_seteq:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xor a0, a0, a1
@@ -2406,6 +2878,15 @@ define i64 @zero1_setne(i64 %a, i64 %b, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_setne:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xor a0, a0, a1
@@ -2458,6 +2939,15 @@ define i64 @zero2_setne(i64 %a, i64 %b, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xor a1, a1, a3
+; RV32XVENTANACONDOPS-NEXT: xor a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a5, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_setne:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xor a0, a0, a1
@@ -2507,6 +2997,13 @@ define i64 @zero1_seteq_zero(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq_zero:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_seteq_zero:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a1, a0
@@ -2551,6 +3048,13 @@ define i64 @zero2_seteq_zero(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq_zero:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_seteq_zero:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a0
@@ -2595,6 +3099,13 @@ define i64 @zero1_setne_zero(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne_zero:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_setne_zero:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a0
@@ -2639,6 +3150,13 @@ define i64 @zero2_setne_zero(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne_zero:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_setne_zero:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: vt.maskcn a0, a1, a0
@@ -2686,6 +3204,15 @@ define i64 @zero1_seteq_constant(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq_constant:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -231
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_seteq_constant:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi a0, a0, 231
@@ -2737,6 +3264,14 @@ define i64 @zero2_seteq_constant(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq_constant:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, 546
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_seteq_constant:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi a0, a0, -546
@@ -2787,6 +3322,14 @@ define i64 @zero1_setne_constant(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne_constant:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, 321
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_setne_constant:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi a0, a0, -321
@@ -2838,6 +3381,15 @@ define i64 @zero2_setne_constant(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne_constant:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -654
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_setne_constant:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi a0, a0, 654
@@ -2890,6 +3442,15 @@ define i64 @zero1_seteq_neg2048(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_seteq_neg2048:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_seteq_neg2048:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xori a0, a0, -2048
@@ -2942,6 +3503,15 @@ define i64 @zero2_seteq_neg2048(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_seteq_neg2048:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_seteq_neg2048:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xori a0, a0, -2048
@@ -2994,6 +3564,15 @@ define i64 @zero1_setne_neg2048(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero1_setne_neg2048:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero1_setne_neg2048:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xori a0, a0, -2048
@@ -3046,6 +3625,15 @@ define i64 @zero2_setne_neg2048(i64 %a, i64 %rs1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: zero2_setne_neg2048:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: not a1, a1
+; RV32XVENTANACONDOPS-NEXT: xori a0, a0, -2048
+; RV32XVENTANACONDOPS-NEXT: or a1, a0, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a3, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: zero2_setne_neg2048:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: xori a0, a0, -2048
@@ -3125,6 +3713,28 @@ define void @sextw_removal_maskc(i1 %c, i32 signext %arg, i32 signext %arg1) nou
; RV64I-NEXT: addi sp, sp, 32
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: sextw_removal_maskc:
+; RV32XVENTANACONDOPS: # %bb.0: # %bb
+; RV32XVENTANACONDOPS-NEXT: addi sp, sp, -16
+; RV32XVENTANACONDOPS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: mv s0, a2
+; RV32XVENTANACONDOPS-NEXT: andi a0, a0, 1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc s1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: .LBB56_1: # %bb2
+; RV32XVENTANACONDOPS-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32XVENTANACONDOPS-NEXT: mv a0, s1
+; RV32XVENTANACONDOPS-NEXT: call bar
+; RV32XVENTANACONDOPS-NEXT: sll s1, s1, s0
+; RV32XVENTANACONDOPS-NEXT: bnez a0, .LBB56_1
+; RV32XVENTANACONDOPS-NEXT: # %bb.2: # %bb7
+; RV32XVENTANACONDOPS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: addi sp, sp, 16
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: sextw_removal_maskc:
; RV64XVENTANACONDOPS: # %bb.0: # %bb
; RV64XVENTANACONDOPS-NEXT: addi sp, sp, -32
@@ -3276,6 +3886,28 @@ define void @sextw_removal_maskcn(i1 %c, i32 signext %arg, i32 signext %arg1) no
; RV64I-NEXT: addi sp, sp, 32
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: sextw_removal_maskcn:
+; RV32XVENTANACONDOPS: # %bb.0: # %bb
+; RV32XVENTANACONDOPS-NEXT: addi sp, sp, -16
+; RV32XVENTANACONDOPS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: mv s0, a2
+; RV32XVENTANACONDOPS-NEXT: andi a0, a0, 1
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn s1, a1, a0
+; RV32XVENTANACONDOPS-NEXT: .LBB57_1: # %bb2
+; RV32XVENTANACONDOPS-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32XVENTANACONDOPS-NEXT: mv a0, s1
+; RV32XVENTANACONDOPS-NEXT: call bar
+; RV32XVENTANACONDOPS-NEXT: sll s1, s1, s0
+; RV32XVENTANACONDOPS-NEXT: bnez a0, .LBB57_1
+; RV32XVENTANACONDOPS-NEXT: # %bb.2: # %bb7
+; RV32XVENTANACONDOPS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: addi sp, sp, 16
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: sextw_removal_maskcn:
; RV64XVENTANACONDOPS: # %bb.0: # %bb
; RV64XVENTANACONDOPS-NEXT: addi sp, sp, -32
@@ -3398,6 +4030,14 @@ define i32 @setune_32(float %a, float %b, i32 %rs1, i32 %rs2) {
; RV64I-NEXT: .LBB58_2:
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setune_32:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: feq.s a2, fa0, fa1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setune_32:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: feq.s a2, fa0, fa1
@@ -3452,6 +4092,17 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) {
; RV64I-NEXT: .LBB59_2:
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: setune_64:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: feq.s a4, fa0, fa1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a4
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a0, a4
+; RV32XVENTANACONDOPS-NEXT: or a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a3, a4
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn a1, a1, a4
+; RV32XVENTANACONDOPS-NEXT: or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: setune_64:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: feq.s a2, fa0, fa1
@@ -3534,6 +4185,25 @@ define signext i16 @numsignbits(i16 signext %0, i16 signext %1, i16 signext %2,
; RV64I-NEXT: addi sp, sp, 16
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: numsignbits:
+; RV32XVENTANACONDOPS: # %bb.0:
+; RV32XVENTANACONDOPS-NEXT: addi sp, sp, -16
+; RV32XVENTANACONDOPS-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT: vt.maskcn s0, a3, a0
+; RV32XVENTANACONDOPS-NEXT: or s0, s0, a2
+; RV32XVENTANACONDOPS-NEXT: beqz a1, .LBB60_2
+; RV32XVENTANACONDOPS-NEXT: # %bb.1:
+; RV32XVENTANACONDOPS-NEXT: mv a0, s0
+; RV32XVENTANACONDOPS-NEXT: call bat
+; RV32XVENTANACONDOPS-NEXT: .LBB60_2:
+; RV32XVENTANACONDOPS-NEXT: mv a0, s0
+; RV32XVENTANACONDOPS-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32XVENTANACONDOPS-NEXT: addi sp, sp, 16
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: numsignbits:
; RV64XVENTANACONDOPS: # %bb.0:
; RV64XVENTANACONDOPS-NEXT: addi sp, sp, -16
@@ -3639,6 +4309,13 @@ define i64 @single_bit(i64 %x) {
; RV64I-NEXT: and a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: single_bit:
+; RV32XVENTANACONDOPS: # %bb.0: # %entry
+; RV32XVENTANACONDOPS-NEXT: andi a2, a0, 1024
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: single_bit:
; RV64XVENTANACONDOPS: # %bb.0: # %entry
; RV64XVENTANACONDOPS-NEXT: andi a1, a0, 1024
@@ -3688,6 +4365,13 @@ define i64 @single_bit2(i64 %x) {
; RV64I-NEXT: and a0, a1, a0
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: single_bit2:
+; RV32XVENTANACONDOPS: # %bb.0: # %entry
+; RV32XVENTANACONDOPS-NEXT: bexti a2, a0, 11
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: single_bit2:
; RV64XVENTANACONDOPS: # %bb.0: # %entry
; RV64XVENTANACONDOPS-NEXT: bexti a1, a0, 11
@@ -3738,6 +4422,14 @@ define i64 @single_bit3(i80 %x, i64 %y) {
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: ret
;
+; RV32XVENTANACONDOPS-LABEL: single_bit3:
+; RV32XVENTANACONDOPS: # %bb.0: # %entry
+; RV32XVENTANACONDOPS-NEXT: lw a0, 8(a0)
+; RV32XVENTANACONDOPS-NEXT: andi a3, a0, 1
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT: vt.maskc a1, a2, a3
+; RV32XVENTANACONDOPS-NEXT: ret
+;
; RV64XVENTANACONDOPS-LABEL: single_bit3:
; RV64XVENTANACONDOPS: # %bb.0: # %entry
; RV64XVENTANACONDOPS-NEXT: andi a1, a1, 1
diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll
index ced6ff6..ee54501 100644
--- a/llvm/test/CodeGen/RISCV/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/double-arith.ll
@@ -1497,3 +1497,51 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind {
%2 = fsub contract double %c, %1
ret double %2
}
+
+define double @fsgnjx_f64(double %x, double %y) nounwind {
+; CHECKIFD-LABEL: fsgnjx_f64:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: fsgnjx.d fa0, fa1, fa0
+; CHECKIFD-NEXT: ret
+;
+; RV32IZFINXZDINX-LABEL: fsgnjx_f64:
+; RV32IZFINXZDINX: # %bb.0:
+; RV32IZFINXZDINX-NEXT: fsgnjx.d a0, a2, a0
+; RV32IZFINXZDINX-NEXT: ret
+;
+; RV64IZFINXZDINX-LABEL: fsgnjx_f64:
+; RV64IZFINXZDINX: # %bb.0:
+; RV64IZFINXZDINX-NEXT: fsgnjx.d a0, a1, a0
+; RV64IZFINXZDINX-NEXT: ret
+;
+; RV32I-LABEL: fsgnjx_f64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lui a0, 524288
+; RV32I-NEXT: and a0, a1, a0
+; RV32I-NEXT: lui a1, 261888
+; RV32I-NEXT: or a1, a0, a1
+; RV32I-NEXT: li a0, 0
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fsgnjx_f64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: srli a0, a0, 63
+; RV64I-NEXT: slli a0, a0, 63
+; RV64I-NEXT: li a2, 1023
+; RV64I-NEXT: slli a2, a2, 52
+; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+ %z = call double @llvm.copysign.f64(double 1.0, double %x)
+ %mul = fmul double %z, %y
+ ret double %mul
+}
diff --git a/llvm/test/CodeGen/RISCV/float-arith.ll b/llvm/test/CodeGen/RISCV/float-arith.ll
index 7a7ebe6..931f73a 100644
--- a/llvm/test/CodeGen/RISCV/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/float-arith.ll
@@ -1195,3 +1195,44 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind {
%2 = fsub contract float %c, %1
ret float %2
}
+
+define float @fsgnjx_f32(float %x, float %y) nounwind {
+; CHECKIF-LABEL: fsgnjx_f32:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: fsgnjx.s fa0, fa1, fa0
+; CHECKIF-NEXT: ret
+;
+; CHECKIZFINX-LABEL: fsgnjx_f32:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: fsgnjx.s a0, a1, a0
+; CHECKIZFINX-NEXT: ret
+;
+; RV32I-LABEL: fsgnjx_f32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lui a2, 524288
+; RV32I-NEXT: and a0, a0, a2
+; RV32I-NEXT: lui a2, 260096
+; RV32I-NEXT: or a0, a0, a2
+; RV32I-NEXT: call __mulsf3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fsgnjx_f32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a2, 524288
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: lui a2, 260096
+; RV64I-NEXT: or a0, a0, a2
+; RV64I-NEXT: call __mulsf3
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+ %z = call float @llvm.copysign.f32(float 1.0, float %x)
+ %mul = fmul float %z, %y
+ ret float %mul
+}
diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index f54adaa..10e63e3 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -3104,3 +3104,112 @@ define half @fnmsub_s_contract(half %a, half %b, half %c) nounwind {
%2 = fsub contract half %c, %1
ret half %2
}
+
+define half @fsgnjx_f16(half %x, half %y) nounwind {
+; CHECKIZFH-LABEL: fsgnjx_f16:
+; CHECKIZFH: # %bb.0:
+; CHECKIZFH-NEXT: fsgnjx.h fa0, fa1, fa0
+; CHECKIZFH-NEXT: ret
+;
+; CHECK-ZHINX-LABEL: fsgnjx_f16:
+; CHECK-ZHINX: # %bb.0:
+; CHECK-ZHINX-NEXT: fsgnjx.h a0, a1, a0
+; CHECK-ZHINX-NEXT: ret
+;
+; RV32I-LABEL: fsgnjx_f16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a2, 15
+; RV32I-NEXT: slli a2, a2, 10
+; RV32I-NEXT: or s1, a0, a2
+; RV32I-NEXT: slli a0, a1, 16
+; RV32I-NEXT: srli a0, a0, 16
+; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: lui a0, 12
+; RV32I-NEXT: addi a0, a0, -1024
+; RV32I-NEXT: and a0, s1, a0
+; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: mv a1, s0
+; RV32I-NEXT: call __mulsf3
+; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: fsgnjx_f16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -32
+; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: li a2, 15
+; RV64I-NEXT: slli a2, a2, 10
+; RV64I-NEXT: or s1, a0, a2
+; RV64I-NEXT: slli a0, a1, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: mv s0, a0
+; RV64I-NEXT: lui a0, 12
+; RV64I-NEXT: addiw a0, a0, -1024
+; RV64I-NEXT: and a0, s1, a0
+; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call __mulsf3
+; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 32
+; RV64I-NEXT: ret
+;
+; CHECK-RV32-FSGNJ-LABEL: fsgnjx_f16:
+; CHECK-RV32-FSGNJ: # %bb.0:
+; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, -16
+; CHECK-RV32-FSGNJ-NEXT: lui a0, %hi(.LCPI23_0)
+; CHECK-RV32-FSGNJ-NEXT: flh fa5, %lo(.LCPI23_0)(a0)
+; CHECK-RV32-FSGNJ-NEXT: fsh fa0, 12(sp)
+; CHECK-RV32-FSGNJ-NEXT: fsh fa5, 8(sp)
+; CHECK-RV32-FSGNJ-NEXT: lbu a0, 13(sp)
+; CHECK-RV32-FSGNJ-NEXT: lbu a1, 9(sp)
+; CHECK-RV32-FSGNJ-NEXT: andi a0, a0, 128
+; CHECK-RV32-FSGNJ-NEXT: andi a1, a1, 127
+; CHECK-RV32-FSGNJ-NEXT: or a0, a1, a0
+; CHECK-RV32-FSGNJ-NEXT: sb a0, 9(sp)
+; CHECK-RV32-FSGNJ-NEXT: flh fa5, 8(sp)
+; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa4, fa1
+; CHECK-RV32-FSGNJ-NEXT: fcvt.s.h fa5, fa5
+; CHECK-RV32-FSGNJ-NEXT: fmul.s fa5, fa5, fa4
+; CHECK-RV32-FSGNJ-NEXT: fcvt.h.s fa0, fa5
+; CHECK-RV32-FSGNJ-NEXT: addi sp, sp, 16
+; CHECK-RV32-FSGNJ-NEXT: ret
+;
+; CHECK-RV64-FSGNJ-LABEL: fsgnjx_f16:
+; CHECK-RV64-FSGNJ: # %bb.0:
+; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, -16
+; CHECK-RV64-FSGNJ-NEXT: lui a0, %hi(.LCPI23_0)
+; CHECK-RV64-FSGNJ-NEXT: flh fa5, %lo(.LCPI23_0)(a0)
+; CHECK-RV64-FSGNJ-NEXT: fsh fa0, 8(sp)
+; CHECK-RV64-FSGNJ-NEXT: fsh fa5, 0(sp)
+; CHECK-RV64-FSGNJ-NEXT: lbu a0, 9(sp)
+; CHECK-RV64-FSGNJ-NEXT: lbu a1, 1(sp)
+; CHECK-RV64-FSGNJ-NEXT: andi a0, a0, 128
+; CHECK-RV64-FSGNJ-NEXT: andi a1, a1, 127
+; CHECK-RV64-FSGNJ-NEXT: or a0, a1, a0
+; CHECK-RV64-FSGNJ-NEXT: sb a0, 1(sp)
+; CHECK-RV64-FSGNJ-NEXT: flh fa5, 0(sp)
+; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa4, fa1
+; CHECK-RV64-FSGNJ-NEXT: fcvt.s.h fa5, fa5
+; CHECK-RV64-FSGNJ-NEXT: fmul.s fa5, fa5, fa4
+; CHECK-RV64-FSGNJ-NEXT: fcvt.h.s fa0, fa5
+; CHECK-RV64-FSGNJ-NEXT: addi sp, sp, 16
+; CHECK-RV64-FSGNJ-NEXT: ret
+ %z = call half @llvm.copysign.f16(half 1.0, half %x)
+ %mul = fmul half %z, %y
+ ret half %mul
+}
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll
index 52d0dab..6666d92 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll
@@ -2252,3 +2252,53 @@ label:
call void asm "lw zero, $0", "*A"(ptr elementtype(i32) getelementptr (i8, ptr blockaddress(@constraint_A_with_local_3, %label), i32 2000))
ret void
}
+
+@_ZN5repro9MY_BUFFER17hb0f674501d5980a6E = external global <{ [16 x i8] }>
+
+; Address is not used by a memory constraint.
+define void @should_not_fold() {
+; RV32I-LABEL: should_not_fold:
+; RV32I: # %bb.0: # %start
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: lui a0, %hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV32I-NEXT: addi a0, a0, %lo(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV32I-NEXT: #APP
+; RV32I-NEXT: ecall
+; RV32I-NEXT: #NO_APP
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: should_not_fold:
+; RV64I: # %bb.0: # %start
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: lui a0, %hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV64I-NEXT: addi a0, a0, %lo(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV64I-NEXT: #APP
+; RV64I-NEXT: ecall
+; RV64I-NEXT: #NO_APP
+; RV64I-NEXT: ret
+;
+; RV32I-MEDIUM-LABEL: should_not_fold:
+; RV32I-MEDIUM: # %bb.0: # %start
+; RV32I-MEDIUM-NEXT: .cfi_def_cfa_offset 0
+; RV32I-MEDIUM-NEXT: .Lpcrel_hi39:
+; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi39)
+; RV32I-MEDIUM-NEXT: #APP
+; RV32I-MEDIUM-NEXT: ecall
+; RV32I-MEDIUM-NEXT: #NO_APP
+; RV32I-MEDIUM-NEXT: ret
+;
+; RV64I-MEDIUM-LABEL: should_not_fold:
+; RV64I-MEDIUM: # %bb.0: # %start
+; RV64I-MEDIUM-NEXT: .cfi_def_cfa_offset 0
+; RV64I-MEDIUM-NEXT: .Lpcrel_hi39:
+; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi39)
+; RV64I-MEDIUM-NEXT: #APP
+; RV64I-MEDIUM-NEXT: ecall
+; RV64I-MEDIUM-NEXT: #NO_APP
+; RV64I-MEDIUM-NEXT: ret
+start:
+ %0 = tail call ptr asm sideeffect alignstack "ecall", "=&{x10},0,~{vtype},~{vl},~{vxsat},~{vxrm},~{memory}"(ptr @_ZN5repro9MY_BUFFER17hb0f674501d5980a6E)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/pr94265.ll b/llvm/test/CodeGen/RISCV/pr94265.ll
index cb41e22..f92cdb4 100644
--- a/llvm/test/CodeGen/RISCV/pr94265.ll
+++ b/llvm/test/CodeGen/RISCV/pr94265.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv32-- -mattr=+v | FileCheck -check-prefix=RV32I %s
; RUN: llc < %s -mtriple=riscv64-- -mattr=+v | FileCheck -check-prefix=RV64I %s
+; RUN: llc < %s -mtriple=riscv32-- -mattr=+zve32x,+zvl128b | FileCheck -check-prefix=RV32I %s
+; RUN: llc < %s -mtriple=riscv64-- -mattr=+zve32x,+zvl128b | FileCheck -check-prefix=RV64I %s
define <8 x i16> @PR94265(<8 x i32> %a0) #0 {
; RV32I-LABEL: PR94265:
diff --git a/llvm/test/CodeGen/RISCV/riscv-tail-dup-size.ll b/llvm/test/CodeGen/RISCV/riscv-tail-dup-size.ll
new file mode 100644
index 0000000..0508016
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/riscv-tail-dup-size.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m -O2 < %s | FileCheck %s --check-prefix=CHECK-O2
+; RUN: llc -mtriple=riscv64 -mattr=+m -O3 < %s | FileCheck %s --check-prefix=CHECK-O3
+
+; RUN: llc -mtriple=riscv64 -mattr=+m -tail-dup-size=2 < %s | FileCheck %s --check-prefix=CHECK-O2
+; RUN: llc -mtriple=riscv64 -mattr=+m -tail-dup-placement-threshold=2 < %s | FileCheck %s --check-prefix=CHECK-O2
+; RUN: llc -mtriple=riscv64 -mattr=+m -tail-dup-placement-threshold=6 < %s | FileCheck %s --check-prefix=CHECK-O3
+
+@a = external global i32
+@b = external global i32
+@c = external global i32
+
+declare i32 @foo(i32)
+
+define i32 @test(i32 %n) {
+; CHECK-O2-LABEL: test:
+; CHECK-O2: # %bb.0: # %entry
+; CHECK-O2-NEXT: sext.w a1, a0
+; CHECK-O2-NEXT: blez a1, .LBB0_2
+; CHECK-O2-NEXT: # %bb.1: # %if.then
+; CHECK-O2-NEXT: lui a1, %hi(a)
+; CHECK-O2-NEXT: lw a1, %lo(a)(a1)
+; CHECK-O2-NEXT: mul a0, a1, a0
+; CHECK-O2-NEXT: j .LBB0_3
+; CHECK-O2-NEXT: .LBB0_2: # %if.else
+; CHECK-O2-NEXT: lui a1, %hi(b)
+; CHECK-O2-NEXT: lw a1, %lo(b)(a1)
+; CHECK-O2-NEXT: divw a0, a1, a0
+; CHECK-O2-NEXT: .LBB0_3: # %if.end
+; CHECK-O2-NEXT: lui a1, %hi(c)
+; CHECK-O2-NEXT: lw a1, %lo(c)(a1)
+; CHECK-O2-NEXT: addi a0, a0, -1
+; CHECK-O2-NEXT: mulw a0, a0, a1
+; CHECK-O2-NEXT: tail foo
+;
+; CHECK-O3-LABEL: test:
+; CHECK-O3: # %bb.0: # %entry
+; CHECK-O3-NEXT: sext.w a1, a0
+; CHECK-O3-NEXT: blez a1, .LBB0_2
+; CHECK-O3-NEXT: # %bb.1: # %if.then
+; CHECK-O3-NEXT: lui a1, %hi(a)
+; CHECK-O3-NEXT: lw a1, %lo(a)(a1)
+; CHECK-O3-NEXT: mul a0, a1, a0
+; CHECK-O3-NEXT: lui a1, %hi(c)
+; CHECK-O3-NEXT: lw a1, %lo(c)(a1)
+; CHECK-O3-NEXT: addi a0, a0, -1
+; CHECK-O3-NEXT: mulw a0, a0, a1
+; CHECK-O3-NEXT: tail foo
+; CHECK-O3-NEXT: .LBB0_2: # %if.else
+; CHECK-O3-NEXT: lui a1, %hi(b)
+; CHECK-O3-NEXT: lw a1, %lo(b)(a1)
+; CHECK-O3-NEXT: divw a0, a1, a0
+; CHECK-O3-NEXT: lui a1, %hi(c)
+; CHECK-O3-NEXT: lw a1, %lo(c)(a1)
+; CHECK-O3-NEXT: addi a0, a0, -1
+; CHECK-O3-NEXT: mulw a0, a0, a1
+; CHECK-O3-NEXT: tail foo
+entry:
+ %cmp = icmp sgt i32 %n, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %va = load i32, ptr @a
+ %mul = mul nsw i32 %va, %n
+ br label %if.end
+
+if.else:
+ %vb = load i32, ptr @b
+ %div = sdiv i32 %vb, %n
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [ %mul, %if.then ], [ %div, %if.else ]
+ %vc = load i32, ptr @c
+ %add = add nsw i32 %phi, -1
+ %arg = mul i32 %add, %vc
+ %ret = tail call i32 @foo(i32 %arg)
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 7cb2452..20a0484 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -2879,8 +2879,8 @@ entry:
ret ptr %5
}
-define i64 @srli_slliw(i64 %1) {
-; RV64I-LABEL: srli_slliw:
+define i64 @srli_slliuw(i64 %1) {
+; RV64I-LABEL: srli_slliuw:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: li a1, 1
@@ -2889,7 +2889,7 @@ define i64 @srli_slliw(i64 %1) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
-; RV64ZBA-LABEL: srli_slliw:
+; RV64ZBA-LABEL: srli_slliuw:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: srli a0, a0, 2
; RV64ZBA-NEXT: slli.uw a0, a0, 4
@@ -2901,8 +2901,8 @@ entry:
ret i64 %4
}
-define i64 @srli_slliw_canonical(i64 %0) {
-; RV64I-LABEL: srli_slliw_canonical:
+define i64 @srli_slliuw_canonical(i64 %0) {
+; RV64I-LABEL: srli_slliuw_canonical:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: slli a0, a0, 2
; RV64I-NEXT: li a1, 1
@@ -2911,7 +2911,7 @@ define i64 @srli_slliw_canonical(i64 %0) {
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
;
-; RV64ZBA-LABEL: srli_slliw_canonical:
+; RV64ZBA-LABEL: srli_slliuw_canonical:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: srli a0, a0, 2
; RV64ZBA-NEXT: slli.uw a0, a0, 4
@@ -2949,3 +2949,46 @@ entry:
%4 = shl i64 %3, 4
ret i64 %4
}
+
+define i64 @srli_slliuw_2(i64 %1) {
+; RV64I-LABEL: srli_slliuw_2:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: srli a0, a0, 15
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: slli a1, a1, 35
+; RV64I-NEXT: addi a1, a1, -8
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: srli_slliuw_2:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: srli a0, a0, 18
+; RV64ZBA-NEXT: slli.uw a0, a0, 3
+; RV64ZBA-NEXT: ret
+entry:
+ %2 = lshr i64 %1, 18
+ %3 = and i64 %2, 4294967295
+ %4 = shl i64 %3, 3
+ ret i64 %4
+}
+
+define i64 @srli_slliuw_canonical_2(i64 %0) {
+; RV64I-LABEL: srli_slliuw_canonical_2:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: srli a0, a0, 15
+; RV64I-NEXT: li a1, 1
+; RV64I-NEXT: slli a1, a1, 35
+; RV64I-NEXT: addi a1, a1, -8
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64ZBA-LABEL: srli_slliuw_canonical_2:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: srli a0, a0, 18
+; RV64ZBA-NEXT: slli.uw a0, a0, 3
+; RV64ZBA-NEXT: ret
+entry:
+ %1 = lshr i64 %0, 15
+ %2 = and i64 %1, 34359738360
+ ret i64 %2
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
index c99388c..93fe666 100644
--- a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
+++ b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll
@@ -27,8 +27,12 @@ define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee_cfi(<vscale x 1 x
; OMIT-FP-NEXT: addi a0, sp, 16
; OMIT-FP-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill
; OMIT-FP-NEXT: .cfi_escape 0x10, 0x61, 0x08, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 2 * vlenb
-; OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x08, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 4 * vlenb
-; OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x08, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa - 8 * vlenb
+; OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x08, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2 @ cfa - 4 * vlenb
+; OMIT-FP-NEXT: .cfi_escape 0x10, 0x63, 0x08, 0x11, 0x7d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v3 @ cfa - 3 * vlenb
+; OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x08, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4 @ cfa - 8 * vlenb
+; OMIT-FP-NEXT: .cfi_escape 0x10, 0x65, 0x08, 0x11, 0x79, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v5 @ cfa - 7 * vlenb
+; OMIT-FP-NEXT: .cfi_escape 0x10, 0x66, 0x08, 0x11, 0x7a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v6 @ cfa - 6 * vlenb
+; OMIT-FP-NEXT: .cfi_escape 0x10, 0x67, 0x08, 0x11, 0x7b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v7 @ cfa - 5 * vlenb
; OMIT-FP-NEXT: #APP
; OMIT-FP-NEXT: #NO_APP
; OMIT-FP-NEXT: csrr a0, vlenb
@@ -79,8 +83,12 @@ define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee_cfi(<vscale x 1 x
; NO-OMIT-FP-NEXT: addi a0, a0, -32
; NO-OMIT-FP-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill
; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x61, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 32 - 2 * vlenb
-; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 32 - 4 * vlenb
-; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa - 32 - 8 * vlenb
+; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2 @ cfa - 32 - 4 * vlenb
+; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x63, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v3 @ cfa - 32 - 3 * vlenb
+; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4 @ cfa - 32 - 8 * vlenb
+; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x65, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x79, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v5 @ cfa - 32 - 7 * vlenb
+; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x66, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v6 @ cfa - 32 - 6 * vlenb
+; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x67, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v7 @ cfa - 32 - 5 * vlenb
; NO-OMIT-FP-NEXT: #APP
; NO-OMIT-FP-NEXT: #NO_APP
; NO-OMIT-FP-NEXT: csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 05d6716..cd2208e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
declare <vscale x 1 x i8> @llvm.vp.abs.nxv1i8(<vscale x 1 x i8>, i1 immarg, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 5217148..5709de567 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index aadd985..6917d7e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvkb,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
diff --git a/llvm/test/CodeGen/RISCV/rvv/commutable.ll b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
index 5bca2ee..e26c467 100644
--- a/llvm/test/CodeGen/RISCV/rvv/commutable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+f,+d,+zvfh,+v \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
; RUN: -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+f,+d,+zvfh,+v \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
; RUN: -verify-machineinstrs | FileCheck %s
; vadd.vv
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index b5cafe4..01aac12 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVBB
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
index 3b7952f..209a37b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
@@ -7,7 +7,7 @@
define half @extractelt_nxv1f16_0(<vscale x 1 x half> %v) {
; CHECK-LABEL: extractelt_nxv1f16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x half> %v, i32 0
@@ -39,7 +39,7 @@ define half @extractelt_nxv1f16_idx(<vscale x 1 x half> %v, i32 zeroext %idx) {
define half @extractelt_nxv2f16_0(<vscale x 2 x half> %v) {
; CHECK-LABEL: extractelt_nxv2f16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 2 x half> %v, i32 0
@@ -199,7 +199,7 @@ define half @extractelt_nxv32f16_idx(<vscale x 32 x half> %v, i32 zeroext %idx)
define float @extractelt_nxv1f32_0(<vscale x 1 x float> %v) {
; CHECK-LABEL: extractelt_nxv1f32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%r = extractelement <vscale x 1 x float> %v, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index df9949e..4d6bc34 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32NOM
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32M
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index a96cf58..aba0ad0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64NOM
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64M
define signext i8 @extractelt_nxv1i8_0(<vscale x 1 x i8> %v) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
index f3e8235..84da351 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
declare <2 x i8> @llvm.vp.abs.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
index 5252eb7..f124d55 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
@@ -265,13 +265,13 @@ define i64 @bitcast_v1i64_i64(<1 x i64> %a) {
define half @bitcast_v2i8_f16(<2 x i8> %a) {
; CHECK-LABEL: bitcast_v2i8_f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
;
; ELEN32-LABEL: bitcast_v2i8_f16:
; ELEN32: # %bb.0:
-; ELEN32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ELEN32-NEXT: vfmv.f.s fa0, v8
; ELEN32-NEXT: ret
%b = bitcast <2 x i8> %a to half
@@ -281,13 +281,13 @@ define half @bitcast_v2i8_f16(<2 x i8> %a) {
define half @bitcast_v1i16_f16(<1 x i16> %a) {
; CHECK-LABEL: bitcast_v1i16_f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
;
; ELEN32-LABEL: bitcast_v1i16_f16:
; ELEN32: # %bb.0:
-; ELEN32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ELEN32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ELEN32-NEXT: vfmv.f.s fa0, v8
; ELEN32-NEXT: ret
%b = bitcast <1 x i16> %a to half
@@ -297,7 +297,7 @@ define half @bitcast_v1i16_f16(<1 x i16> %a) {
define float @bitcast_v4i8_f32(<4 x i8> %a) {
; CHECK-LABEL: bitcast_v4i8_f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
;
@@ -313,7 +313,7 @@ define float @bitcast_v4i8_f32(<4 x i8> %a) {
define float @bitcast_v2i16_f32(<2 x i16> %a) {
; CHECK-LABEL: bitcast_v2i16_f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
;
@@ -329,7 +329,7 @@ define float @bitcast_v2i16_f32(<2 x i16> %a) {
define float @bitcast_v1i32_f32(<1 x i32> %a) {
; CHECK-LABEL: bitcast_v1i32_f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 068c25b..90bedf8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <2 x i8> @llvm.vp.bitreverse.v2i8(<2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index 1490738..6f2e860 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
index 3286c33..809884c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
define <4 x i8> @ret_v4i8(ptr %p) {
; CHECK-LABEL: ret_v4i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index b42fb8c..f5e6b92 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index 5fceab8..e90e52f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <2 x i8> @llvm.vp.ctpop.v2i8(<2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index e7736e7..dfad788 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8>, i1 immarg, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
index 369f905..bb2b57f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
define <2 x i16> @sextload_v2i1_v2i16(ptr %x) {
; CHECK-LABEL: sextload_v2i1_v2i16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index 386c71c..493481a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32ZBS
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64ZBS
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32ZBS
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zbs -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64ZBS
define i1 @extractelt_v1i1(ptr %x, i64 %idx) nounwind {
; CHECK-LABEL: extractelt_v1i1:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll
index 2fe08fc..5f5015c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-bitcast.ll
@@ -19,7 +19,7 @@ define i16 @bitcast_v1f16_i16(<1 x half> %a) {
define half @bitcast_v1f16_f16(<1 x half> %a) {
; CHECK-LABEL: bitcast_v1f16_f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%b = bitcast <1 x half> %a to half
@@ -49,7 +49,7 @@ define i32 @bitcast_v1f32_i32(<1 x float> %a) {
define float @bitcast_v2f16_f32(<2 x half> %a) {
; CHECK-LABEL: bitcast_v2f16_f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%b = bitcast <2 x half> %a to float
@@ -59,7 +59,7 @@ define float @bitcast_v2f16_f32(<2 x half> %a) {
define float @bitcast_v1f32_f32(<1 x float> %a) {
; CHECK-LABEL: bitcast_v1f32_f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%b = bitcast <1 x float> %a to float
@@ -237,7 +237,7 @@ define <1 x double> @bitcast_i64_v1f64(i64 %a) {
define <1 x i16> @bitcast_f16_v1i16(half %a) {
; CHECK-LABEL: bitcast_f16_v1i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%b = bitcast half %a to <1 x i16>
@@ -247,7 +247,7 @@ define <1 x i16> @bitcast_f16_v1i16(half %a) {
define <1 x half> @bitcast_f16_v1f16(half %a) {
; CHECK-LABEL: bitcast_f16_v1f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%b = bitcast half %a to <1 x half>
@@ -257,7 +257,7 @@ define <1 x half> @bitcast_f16_v1f16(half %a) {
define <2 x i16> @bitcast_f32_v2i16(float %a) {
; CHECK-LABEL: bitcast_f32_v2i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%b = bitcast float %a to <2 x i16>
@@ -267,7 +267,7 @@ define <2 x i16> @bitcast_f32_v2i16(float %a) {
define <2 x half> @bitcast_f32_v2f16(float %a) {
; CHECK-LABEL: bitcast_f32_v2f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%b = bitcast float %a to <2 x half>
@@ -277,7 +277,7 @@ define <2 x half> @bitcast_f32_v2f16(float %a) {
define <1 x i32> @bitcast_f32_v1i32(float %a) {
; CHECK-LABEL: bitcast_f32_v1i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%b = bitcast float %a to <1 x i32>
@@ -287,7 +287,7 @@ define <1 x i32> @bitcast_f32_v1i32(float %a) {
define <1 x float> @bitcast_f32_v1f32(float %a) {
; CHECK-LABEL: bitcast_f32_v1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%b = bitcast float %a to <1 x float>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index be602e3..f65431b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -915,6 +915,152 @@ define <16 x i8> @buildvec_not_vid_v16i8() {
ret <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 3, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0>
}
+define <512 x i8> @buildvec_vid_v512i8_indices_overflow() vscale_range(16, 1024) {
+; CHECK-LABEL: buildvec_vid_v512i8_indices_overflow:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: ret
+  ret <512 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63, i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 128, i8 129, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137, i8 138, i8 139, i8 140, i8 141, i8 142, i8 143, i8 144, i8 145, i8 146, i8 147, i8 148, i8 149, i8 150, i8 151, i8 152, i8 153, i8 154, i8 155, i8 156, i8 157, i8 158, i8 159, i8 160, i8 161, i8 162, i8 163, i8 164, i8 165, i8 166, i8 167, i8 168, i8 169, i8 170, i8 171, i8 172, i8 173, i8 174, i8 175, i8 176, i8 177, i8 178, i8 179, i8 180, i8 181, i8 182, i8 183, i8 184, i8 185, i8 186, i8 187, i8 188, i8 189, i8 190, i8 191, i8 192, i8 193, i8 194, i8 195, i8 196, i8 197, i8 198, i8 199, i8 200, i8 201, i8 202, i8 203, i8 204, i8 205, i8 206, i8 207, i8 208, i8 209, i8 210, i8 211, i8 212, i8 213, i8 214, i8 215, i8 216, i8 217, i8 218, i8 219, i8 220, i8 221, i8 222, i8 223, i8 224, i8 225, i8 226, i8 227, i8 228, i8 229, i8 230, i8 231, i8 232, i8 233, i8 234, i8 235, i8 236, i8 237, i8 238, i8 239, i8 240, i8 241, i8 242, i8 243, i8 244, i8 245, i8 246, i8 247, i8 248, i8 249, i8 250, i8 251, i8 252, i8 253, i8 254, i8 255, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63, i8 64, i8 65, i8 66, i8 67, i8 68, i8 69, i8 70, i8 71, i8 72, i8 73, i8 74, i8 75, i8 76, i8 77, i8 78, i8 79, i8 80, i8 81, i8 82, i8 83, i8 84, i8 85, i8 86, i8 87, i8 88, i8 89, i8 90, i8 91, i8 92, i8 93, i8 94, i8 95, i8 96, i8 97, i8 98, i8 99, i8 100, i8 101, i8 102, i8 103, i8 104, i8 105, i8 106, i8 107, i8 108, i8 109, i8 110, i8 111, i8 112, i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 128, i8 129, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137, i8 138, i8 139, i8 140, i8 141, i8 142, i8 143, i8 144, i8 145, i8 146, i8 147, i8 148, i8 149, i8 150, i8 151, i8 152, i8 153, i8 154, i8 155, i8 156, i8 157, i8 158, i8 159, i8 160, i8 161, i8 162, i8 163, i8 164, i8 165, i8 166, i8 167, i8 168, i8 169, i8 170, i8 171, i8 172, i8 173, i8 174, i8 175, i8 176, i8 177, i8 178, i8 179, i8 180, i8 181, i8 182, i8 183, i8 184, i8 185, i8 186, i8 187, i8 188, i8 189, i8 190, i8 191, i8 192, i8 193, i8 194, i8 195, i8 196, i8 197, i8 198, i8 199, i8 200, i8 201, i8 202, i8 203, i8 204, i8 205, i8 206, i8 207, i8 208, i8 209, i8 210, i8 211, i8 212, i8 213, i8 214, i8 215, i8 216, i8 217, i8 218, i8 219, i8 220, i8 221, i8 222, i8 223, i8 224, i8 225, i8 226, i8 227, i8 228, i8 229, i8 230, i8 231, i8 232, i8 233, i8 234, i8 235, i8 236, i8 237, i8 238, i8 239, i8 240, i8 241, i8 242, i8 243, i8 244, i8 245, i8 246, i8 247, i8 248, i8 249, i8 250, i8 251, i8 252, i8 253, i8 254, i8 255>
+}
+
+define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_1() vscale_range(16, 1024) {
+; RV32-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 512
+; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT: vid.v v8
+; RV32-NEXT: vsrl.vi v8, v8, 3
+; RV32-NEXT: vadd.vi v0, v8, -1
+; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT: vmv.v.i v8, 1
+; RV32-NEXT: vmerge.vim v8, v8, 0, v0
+; RV32-NEXT: ret
+;
+; RV64V-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
+; RV64V: # %bb.0:
+; RV64V-NEXT: li a0, 512
+; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT: vid.v v8
+; RV64V-NEXT: vsrl.vi v8, v8, 2
+; RV64V-NEXT: vadd.vi v0, v8, -1
+; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT: vmv.v.i v8, 1
+; RV64V-NEXT: vmerge.vim v8, v8, 0, v0
+; RV64V-NEXT: ret
+;
+; RV64ZVE32-LABEL: buildvec_not_vid_v512i8_indices_overflow_1:
+; RV64ZVE32: # %bb.0:
+; RV64ZVE32-NEXT: li a0, 512
+; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT: vid.v v8
+; RV64ZVE32-NEXT: vsrl.vi v8, v8, 3
+; RV64ZVE32-NEXT: vadd.vi v0, v8, -1
+; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT: vmv.v.i v8, 1
+; RV64ZVE32-NEXT: vmerge.vim v8, v8, 0, v0
+; RV64ZVE32-NEXT: ret
+ ret <512 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+}
+
+define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16, 1024) {
+; RV32-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT: vmv.v.i v0, 15
+; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vmerge.vim v10, v9, -1, v0
+; RV32-NEXT: li a0, 512
+; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT: vmv.v.i v12, 3
+; RV32-NEXT: li a1, 240
+; RV32-NEXT: vsetvli zero, zero, e16, m8, ta, ma
+; RV32-NEXT: vmv.s.x v8, a1
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vsetvli zero, zero, e8, m4, ta, ma
+; RV32-NEXT: vmerge.vim v12, v12, 0, v0
+; RV32-NEXT: vmv1r.v v0, v8
+; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT: vmerge.vim v10, v9, -1, v0
+; RV32-NEXT: li a1, 15
+; RV32-NEXT: slli a1, a1, 8
+; RV32-NEXT: vmv.s.x v8, a1
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT: vmerge.vim v12, v12, 1, v0
+; RV32-NEXT: vmv1r.v v0, v8
+; RV32-NEXT: vsetivli zero, 16, e32, mf2, ta, ma
+; RV32-NEXT: vmerge.vim v8, v9, -1, v0
+; RV32-NEXT: vmv1r.v v0, v8
+; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT: vmerge.vim v8, v12, 2, v0
+; RV32-NEXT: ret
+;
+; RV64V-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
+; RV64V: # %bb.0:
+; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT: vmv.v.i v0, 3
+; RV64V-NEXT: vmv.v.i v9, 0
+; RV64V-NEXT: vmerge.vim v10, v9, -1, v0
+; RV64V-NEXT: li a0, 512
+; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT: vmv.v.i v12, 3
+; RV64V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64V-NEXT: vmv.v.i v8, 12
+; RV64V-NEXT: vmv1r.v v0, v10
+; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT: vmerge.vim v12, v12, 0, v0
+; RV64V-NEXT: vmv1r.v v0, v8
+; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT: vmerge.vim v10, v9, -1, v0
+; RV64V-NEXT: li a1, 48
+; RV64V-NEXT: vmv.s.x v8, a1
+; RV64V-NEXT: vmv.v.v v0, v10
+; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT: vmerge.vim v12, v12, 1, v0
+; RV64V-NEXT: vmv1r.v v0, v8
+; RV64V-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+; RV64V-NEXT: vmerge.vim v8, v9, -1, v0
+; RV64V-NEXT: vmv.v.v v0, v8
+; RV64V-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64V-NEXT: vmerge.vim v8, v12, 2, v0
+; RV64V-NEXT: ret
+;
+; RV64ZVE32-LABEL: buildvec_not_vid_v512i8_indices_overflow_2:
+; RV64ZVE32: # %bb.0:
+; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT: vmv.v.i v0, 15
+; RV64ZVE32-NEXT: vmv.v.i v9, 0
+; RV64ZVE32-NEXT: vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT: li a0, 512
+; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT: vmv.v.i v12, 3
+; RV64ZVE32-NEXT: li a1, 240
+; RV64ZVE32-NEXT: vsetvli zero, zero, e16, m8, ta, ma
+; RV64ZVE32-NEXT: vmv.s.x v8, a1
+; RV64ZVE32-NEXT: vmv1r.v v0, v10
+; RV64ZVE32-NEXT: vsetvli zero, zero, e8, m4, ta, ma
+; RV64ZVE32-NEXT: vmerge.vim v12, v12, 0, v0
+; RV64ZVE32-NEXT: vmv1r.v v0, v8
+; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT: vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT: li a1, 15
+; RV64ZVE32-NEXT: slli a1, a1, 8
+; RV64ZVE32-NEXT: vmv.s.x v8, a1
+; RV64ZVE32-NEXT: vmv.v.v v0, v10
+; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT: vmerge.vim v12, v12, 1, v0
+; RV64ZVE32-NEXT: vmv1r.v v0, v8
+; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m1, ta, ma
+; RV64ZVE32-NEXT: vmerge.vim v8, v9, -1, v0
+; RV64ZVE32-NEXT: vmv.v.v v0, v8
+; RV64ZVE32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT: vmerge.vim v8, v12, 2, v0
+; RV64ZVE32-NEXT: ret
+ ret <512 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+}
+
define <8 x i32> @prefix_overwrite(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: prefix_overwrite:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index 2d3865b..901be44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -11,7 +11,7 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: call llrintf
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
@@ -23,10 +23,10 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
;
; RV64-LABEL: llrint_v1i64_v1f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-NEXT: vfmv.f.s fa5, v8
; RV64-NEXT: fcvt.l.s a0, fa5
-; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT: vmv.s.x v8, a0
; RV64-NEXT: ret
%a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
@@ -47,7 +47,7 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 2 * vlenb
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: call llrintf
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index de47d85..a90ee3e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -9,7 +9,7 @@
define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
; RV32-LABEL: lrint_v1f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa5, v8
; RV32-NEXT: fcvt.w.s a0, fa5
; RV32-NEXT: vmv.s.x v8, a0
@@ -17,7 +17,7 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
;
; RV64-i32-LABEL: lrint_v1f32:
; RV64-i32: # %bb.0:
-; RV64-i32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-i32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-i32-NEXT: vfmv.f.s fa5, v8
; RV64-i32-NEXT: fcvt.l.s a0, fa5
; RV64-i32-NEXT: vmv.s.x v8, a0
@@ -25,10 +25,10 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
;
; RV64-i64-LABEL: lrint_v1f32:
; RV64-i64: # %bb.0:
-; RV64-i64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-i64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64-i64-NEXT: vfmv.f.s fa5, v8
; RV64-i64-NEXT: fcvt.l.s a0, fa5
-; RV64-i64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV64-i64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; RV64-i64-NEXT: vmv.s.x v8, a0
; RV64-i64-NEXT: ret
%a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
index 7adaaa0..793e8eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll
@@ -13,7 +13,7 @@ declare half @llvm.vp.reduce.fadd.v2f16(half, <2 x half>, <2 x i1>, i32)
define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_fadd_v2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfredusum.vs v9, v8, v9, v0.t
@@ -39,7 +39,7 @@ define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroex
define half @vpreduce_ord_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_ord_fadd_v2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfredosum.vs v9, v8, v9, v0.t
@@ -67,7 +67,7 @@ declare half @llvm.vp.reduce.fadd.v4f16(half, <4 x half>, <4 x i1>, i32)
define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_fadd_v4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfredusum.vs v9, v8, v9, v0.t
@@ -93,7 +93,7 @@ define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroex
define half @vpreduce_ord_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_ord_fadd_v4f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfredosum.vs v9, v8, v9, v0.t
@@ -121,7 +121,7 @@ declare float @llvm.vp.reduce.fadd.v2f32(float, <2 x float>, <2 x i1>, i32)
define float @vpreduce_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpreduce_fadd_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t
@@ -134,7 +134,7 @@ define float @vpreduce_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zer
define float @vpreduce_ord_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpreduce_ord_fadd_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll
index d528970..a6bbbaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-extract-subvector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
define <8 x i8> @v8i8_from_v16xi8_low(<16 x i8> %a) nounwind {
; CHECK-LABEL: v8i8_from_v16xi8_low:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index b8c7037..849f98c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -638,14 +638,14 @@ declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64,
define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
; CHECK-OPT: # %bb.0:
-; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-OPT-NEXT: vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero
; CHECK-OPT-NEXT: ret
;
; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
; CHECK-NO-OPT: # %bb.0:
; CHECK-NO-OPT-NEXT: lbu a0, 0(a0)
-; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NO-OPT-NEXT: vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-NO-OPT-NEXT: vmv.v.x v8, a0
; CHECK-NO-OPT-NEXT: ret
%load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 3)
@@ -657,14 +657,14 @@ define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-OPT: # %bb.0:
-; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-OPT-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero
; CHECK-OPT-NEXT: ret
;
; CHECK-NO-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
; CHECK-NO-OPT: # %bb.0:
; CHECK-NO-OPT-NEXT: flh fa5, 0(a0)
-; CHECK-NO-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NO-OPT-NEXT: vsetivli zero, 3, e16, mf2, ta, ma
; CHECK-NO-OPT-NEXT: vfmv.v.f v8, fa5
; CHECK-NO-OPT-NEXT: ret
%load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 3)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
index fdb6bfe..4334f293 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfptoi-constrained-sdnode.ll
@@ -34,14 +34,14 @@ declare <1 x i7> @llvm.experimental.constrained.fptosi.v1i7.v1f16(<1 x half>, me
define <1 x i7> @vfptosi_v1f16_v1i7(<1 x half> %va) strictfp {
; RV32-LABEL: vfptosi_v1f16_v1i7:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa5, v8
; RV32-NEXT: fcvt.w.h a0, fa5, rtz
; RV32-NEXT: ret
;
; RV64-LABEL: vfptosi_v1f16_v1i7:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV64-NEXT: vfmv.f.s fa5, v8
; RV64-NEXT: fcvt.l.h a0, fa5, rtz
; RV64-NEXT: ret
@@ -53,14 +53,14 @@ declare <1 x i7> @llvm.experimental.constrained.fptoui.v1i7.v1f16(<1 x half>, me
define <1 x i7> @vfptoui_v1f16_v1i7(<1 x half> %va) strictfp {
; RV32-LABEL: vfptoui_v1f16_v1i7:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa5, v8
; RV32-NEXT: fcvt.wu.h a0, fa5, rtz
; RV32-NEXT: ret
;
; RV64-LABEL: vfptoui_v1f16_v1i7:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; RV64-NEXT: vfmv.f.s fa5, v8
; RV64-NEXT: fcvt.lu.h a0, fa5, rtz
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 3a99f53..cb50ca4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -97,3 +97,112 @@ define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a,
store <2 x double> %g, ptr %z
ret void
}
+
+define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2, <2 x double> %w) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v12, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v12, v8
+; NO_FOLDING-NEXT: vfmadd.vv v12, v9, v11
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse64.v v10, (a0)
+; NO_FOLDING-NEXT: vse64.v v12, (a1)
+; NO_FOLDING-NEXT: vse64.v v8, (a2)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwmul.vv v12, v8, v9
+; FOLDING-NEXT: vfwmacc.vv v11, v8, v10
+; FOLDING-NEXT: vfwsub.vv v8, v9, v10
+; FOLDING-NEXT: vse64.v v12, (a0)
+; FOLDING-NEXT: vse64.v v11, (a1)
+; FOLDING-NEXT: vse64.v v8, (a2)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %d2 = fpext <2 x float> %b2 to <2 x double>
+ %e = fmul <2 x double> %c, %d
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %w)
+ %g = fsub <2 x double> %d, %d2
+ store <2 x double> %e, ptr %x
+ store <2 x double> %f, ptr %y
+ store <2 x double> %g, ptr %z
+ ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_multiple_users_addend_user(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8
+; NO_FOLDING-NEXT: vfmadd.vv v11, v9, v8
+; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
+; NO_FOLDING-NEXT: vse64.v v10, (a0)
+; NO_FOLDING-NEXT: vse64.v v11, (a1)
+; NO_FOLDING-NEXT: vse64.v v8, (a2)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_multiple_users_addend_user:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwcvt.f.f.v v11, v8
+; FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT: vfwcvt.f.f.v v9, v10
+; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT: vfmul.vv v10, v11, v8
+; FOLDING-NEXT: vfmadd.vv v11, v9, v8
+; FOLDING-NEXT: vfsub.vv v8, v8, v9
+; FOLDING-NEXT: vse64.v v10, (a0)
+; FOLDING-NEXT: vse64.v v11, (a1)
+; FOLDING-NEXT: vse64.v v8, (a2)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %d2 = fpext <2 x float> %b2 to <2 x double>
+ %e = fmul <2 x double> %c, %d
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d2, <2 x double> %d)
+ %g = fsub <2 x double> %d, %d2
+ store <2 x double> %e, ptr %x
+ store <2 x double> %f, ptr %y
+ store <2 x double> %g, ptr %z
+ ret void
+}
+
+; Negative test. We can't fold because the FMA addend is a user.
+define void @vfwmacc_v2f32_addend_user(ptr %x, <2 x float> %a, <2 x float> %b) {
+; NO_FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v10, v8
+; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; NO_FOLDING-NEXT: vfmadd.vv v8, v10, v8
+; NO_FOLDING-NEXT: vse64.v v8, (a0)
+; NO_FOLDING-NEXT: ret
+;
+; FOLDING-LABEL: vfwmacc_v2f32_addend_user:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; FOLDING-NEXT: vfwcvt.f.f.v v10, v8
+; FOLDING-NEXT: vfwcvt.f.f.v v8, v9
+; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; FOLDING-NEXT: vfmadd.vv v8, v10, v8
+; FOLDING-NEXT: vse64.v v8, (a0)
+; FOLDING-NEXT: ret
+ %c = fpext <2 x float> %a to <2 x double>
+ %d = fpext <2 x float> %b to <2 x double>
+ %f = call <2 x double> @llvm.fma(<2 x double> %c, <2 x double> %d, <2 x double> %d)
+ store <2 x double> %f, ptr %x
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
index 1803b52..5140d89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll
@@ -2031,11 +2031,8 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h
define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwmacc_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfmacc.vf v8, fa5, v10
+; CHECK-NEXT: vfwmacc.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2048,11 +2045,8 @@ define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwmsac_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfmsac.vf v8, fa5, v10
+; CHECK-NEXT: vfwmsac.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2066,11 +2060,8 @@ define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwnmacc_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfnmacc.vf v8, fa5, v10
+; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
@@ -2085,11 +2076,8 @@ define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c)
define <2 x float> @vfwnmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) {
; CHECK-LABEL: vfwnmsac_vf2_v2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: fcvt.s.h fa5, fa0
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v10, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vfnmsac.vf v8, fa5, v10
+; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9
; CHECK-NEXT: ret
%cext = fpext half %c to float
%head = insertelement <2 x float> poison, float %cext, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll
index 2181fd8..4805d67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll
index 695fba6..805e2e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 3e2db3f..1395dc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2670,7 +2670,7 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
@@ -2803,7 +2803,7 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: snez a1, a1
@@ -2915,7 +2915,7 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv a2, s1
@@ -5985,7 +5985,7 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
@@ -6111,7 +6111,7 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: mv s0, a0
@@ -6219,7 +6219,7 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv a2, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
index 060e996..8cfa88e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll
@@ -7,7 +7,7 @@
define <vscale x 1 x half> @insertelt_nxv1f16_0(<vscale x 1 x half> %v, half %elt) {
; CHECK-LABEL: insertelt_nxv1f16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, tu, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%r = insertelement <vscale x 1 x half> %v, half %elt, i32 0
@@ -29,7 +29,7 @@ define <vscale x 1 x half> @insertelt_nxv1f16_idx(<vscale x 1 x half> %v, half %
; CHECK-LABEL: insertelt_nxv1f16_idx:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, a0, 1
-; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
@@ -41,7 +41,7 @@ define <vscale x 1 x half> @insertelt_nxv1f16_idx(<vscale x 1 x half> %v, half %
define <vscale x 2 x half> @insertelt_nxv2f16_0(<vscale x 2 x half> %v, half %elt) {
; CHECK-LABEL: insertelt_nxv2f16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, tu, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%r = insertelement <vscale x 2 x half> %v, half %elt, i32 0
@@ -63,7 +63,7 @@ define <vscale x 2 x half> @insertelt_nxv2f16_idx(<vscale x 2 x half> %v, half %
; CHECK-LABEL: insertelt_nxv2f16_idx:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, a0, 1
-; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
@@ -211,7 +211,7 @@ define <vscale x 32 x half> @insertelt_nxv32f16_idx(<vscale x 32 x half> %v, hal
define <vscale x 1 x float> @insertelt_nxv1f32_0(<vscale x 1 x float> %v, float %elt) {
; CHECK-LABEL: insertelt_nxv1f32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, tu, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
%r = insertelement <vscale x 1 x float> %v, float %elt, i32 0
@@ -233,7 +233,7 @@ define <vscale x 1 x float> @insertelt_nxv1f32_idx(<vscale x 1 x float> %v, floa
; CHECK-LABEL: insertelt_nxv1f32_idx:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a1, a0, 1
-; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
index 410ef8a..c3cc90c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s
define <vscale x 1 x i8> @insertelt_nxv1i8_0(<vscale x 1 x i8> %v, i8 signext %elt) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
index 5bba1c5..1523126 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-int-rv64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s
define <vscale x 1 x i8> @insertelt_nxv1i8_0(<vscale x 1 x i8> %v, i8 signext %elt) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
index 4b4cffc..77438ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s
; Check that we are able to legalize scalable-vector stores that require widening.
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll
deleted file mode 100644
index aaac0c7..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv64.ll
+++ /dev/null
@@ -1,728 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-
-define <vscale x 1 x i8> @sext_nxv1i1_nxv1i8(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i8>
- ret <vscale x 1 x i8> %r
-}
-
-define <vscale x 1 x i8> @zext_nxv1i1_nxv1i8(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i8>
- ret <vscale x 1 x i8> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i8_nxv1i1(<vscale x 1 x i8> %v) {
-; CHECK-LABEL: trunc_nxv1i8_nxv1i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 1 x i8> %v to <vscale x 1 x i1>
- ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i8> @sext_nxv2i1_nxv2i8(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i8>
- ret <vscale x 2 x i8> %r
-}
-
-define <vscale x 2 x i8> @zext_nxv2i1_nxv2i8(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i8>
- ret <vscale x 2 x i8> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i8_nxv2i1(<vscale x 2 x i8> %v) {
-; CHECK-LABEL: trunc_nxv2i8_nxv2i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 2 x i8> %v to <vscale x 2 x i1>
- ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i8> @sext_nxv4i1_nxv4i8(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i8>
- ret <vscale x 4 x i8> %r
-}
-
-define <vscale x 4 x i8> @zext_nxv4i1_nxv4i8(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i8>
- ret <vscale x 4 x i8> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i8_nxv4i1(<vscale x 4 x i8> %v) {
-; CHECK-LABEL: trunc_nxv4i8_nxv4i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 4 x i8> %v to <vscale x 4 x i1>
- ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i8> @sext_nxv8i1_nxv8i8(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i8>
- ret <vscale x 8 x i8> %r
-}
-
-define <vscale x 8 x i8> @zext_nxv8i1_nxv8i8(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i8>
- ret <vscale x 8 x i8> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i8_nxv8i1(<vscale x 8 x i8> %v) {
-; CHECK-LABEL: trunc_nxv8i8_nxv8i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 8 x i8> %v to <vscale x 8 x i1>
- ret <vscale x 8 x i1> %r
-}
-
-define <vscale x 16 x i8> @sext_nxv16i1_nxv16i8(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: sext_nxv16i1_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 16 x i1> %v to <vscale x 16 x i8>
- ret <vscale x 16 x i8> %r
-}
-
-define <vscale x 16 x i8> @zext_nxv16i1_nxv16i8(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: zext_nxv16i1_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 16 x i1> %v to <vscale x 16 x i8>
- ret <vscale x 16 x i8> %r
-}
-
-define <vscale x 16 x i1> @trunc_nxv16i8_nxv16i1(<vscale x 16 x i8> %v) {
-; CHECK-LABEL: trunc_nxv16i8_nxv16i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 16 x i8> %v to <vscale x 16 x i1>
- ret <vscale x 16 x i1> %r
-}
-
-define <vscale x 32 x i8> @sext_nxv32i1_nxv32i8(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: sext_nxv32i1_nxv32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 32 x i1> %v to <vscale x 32 x i8>
- ret <vscale x 32 x i8> %r
-}
-
-define <vscale x 32 x i8> @zext_nxv32i1_nxv32i8(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: zext_nxv32i1_nxv32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 32 x i1> %v to <vscale x 32 x i8>
- ret <vscale x 32 x i8> %r
-}
-
-define <vscale x 32 x i1> @trunc_nxv32i8_nxv32i1(<vscale x 32 x i8> %v) {
-; CHECK-LABEL: trunc_nxv32i8_nxv32i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 32 x i8> %v to <vscale x 32 x i1>
- ret <vscale x 32 x i1> %r
-}
-
-define <vscale x 64 x i8> @sext_nxv64i1_nxv64i8(<vscale x 64 x i1> %v) {
-; CHECK-LABEL: sext_nxv64i1_nxv64i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 64 x i1> %v to <vscale x 64 x i8>
- ret <vscale x 64 x i8> %r
-}
-
-define <vscale x 64 x i8> @zext_nxv64i1_nxv64i8(<vscale x 64 x i1> %v) {
-; CHECK-LABEL: zext_nxv64i1_nxv64i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 64 x i1> %v to <vscale x 64 x i8>
- ret <vscale x 64 x i8> %r
-}
-
-define <vscale x 64 x i1> @trunc_nxv64i8_nxv64i1(<vscale x 64 x i8> %v) {
-; CHECK-LABEL: trunc_nxv64i8_nxv64i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 64 x i8> %v to <vscale x 64 x i1>
- ret <vscale x 64 x i1> %r
-}
-
-define <vscale x 1 x i16> @sext_nxv1i1_nxv1i16(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i16>
- ret <vscale x 1 x i16> %r
-}
-
-define <vscale x 1 x i16> @zext_nxv1i1_nxv1i16(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i16>
- ret <vscale x 1 x i16> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i16_nxv1i1(<vscale x 1 x i16> %v) {
-; CHECK-LABEL: trunc_nxv1i16_nxv1i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 1 x i16> %v to <vscale x 1 x i1>
- ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i16> @sext_nxv2i1_nxv2i16(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i16>
- ret <vscale x 2 x i16> %r
-}
-
-define <vscale x 2 x i16> @zext_nxv2i1_nxv2i16(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i16>
- ret <vscale x 2 x i16> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i16_nxv2i1(<vscale x 2 x i16> %v) {
-; CHECK-LABEL: trunc_nxv2i16_nxv2i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 2 x i16> %v to <vscale x 2 x i1>
- ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i16> @sext_nxv4i1_nxv4i16(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i16>
- ret <vscale x 4 x i16> %r
-}
-
-define <vscale x 4 x i16> @zext_nxv4i1_nxv4i16(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i16>
- ret <vscale x 4 x i16> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i16_nxv4i1(<vscale x 4 x i16> %v) {
-; CHECK-LABEL: trunc_nxv4i16_nxv4i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 4 x i16> %v to <vscale x 4 x i1>
- ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i16> @sext_nxv8i1_nxv8i16(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i16>
- ret <vscale x 8 x i16> %r
-}
-
-define <vscale x 8 x i16> @zext_nxv8i1_nxv8i16(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i16>
- ret <vscale x 8 x i16> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i16_nxv8i1(<vscale x 8 x i16> %v) {
-; CHECK-LABEL: trunc_nxv8i16_nxv8i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 8 x i16> %v to <vscale x 8 x i1>
- ret <vscale x 8 x i1> %r
-}
-
-define <vscale x 16 x i16> @sext_nxv16i1_nxv16i16(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: sext_nxv16i1_nxv16i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 16 x i1> %v to <vscale x 16 x i16>
- ret <vscale x 16 x i16> %r
-}
-
-define <vscale x 16 x i16> @zext_nxv16i1_nxv16i16(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: zext_nxv16i1_nxv16i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 16 x i1> %v to <vscale x 16 x i16>
- ret <vscale x 16 x i16> %r
-}
-
-define <vscale x 16 x i1> @trunc_nxv16i16_nxv16i1(<vscale x 16 x i16> %v) {
-; CHECK-LABEL: trunc_nxv16i16_nxv16i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 16 x i16> %v to <vscale x 16 x i1>
- ret <vscale x 16 x i1> %r
-}
-
-define <vscale x 32 x i16> @sext_nxv32i1_nxv32i16(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: sext_nxv32i1_nxv32i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 32 x i1> %v to <vscale x 32 x i16>
- ret <vscale x 32 x i16> %r
-}
-
-define <vscale x 32 x i16> @zext_nxv32i1_nxv32i16(<vscale x 32 x i1> %v) {
-; CHECK-LABEL: zext_nxv32i1_nxv32i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 32 x i1> %v to <vscale x 32 x i16>
- ret <vscale x 32 x i16> %r
-}
-
-define <vscale x 32 x i1> @trunc_nxv32i16_nxv32i1(<vscale x 32 x i16> %v) {
-; CHECK-LABEL: trunc_nxv32i16_nxv32i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 32 x i16> %v to <vscale x 32 x i1>
- ret <vscale x 32 x i1> %r
-}
-
-define <vscale x 1 x i32> @sext_nxv1i1_nxv1i32(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i32>
- ret <vscale x 1 x i32> %r
-}
-
-define <vscale x 1 x i32> @zext_nxv1i1_nxv1i32(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i32>
- ret <vscale x 1 x i32> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i32_nxv1i1(<vscale x 1 x i32> %v) {
-; CHECK-LABEL: trunc_nxv1i32_nxv1i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 1 x i32> %v to <vscale x 1 x i1>
- ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i32> @sext_nxv2i1_nxv2i32(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i32>
- ret <vscale x 2 x i32> %r
-}
-
-define <vscale x 2 x i32> @zext_nxv2i1_nxv2i32(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i32>
- ret <vscale x 2 x i32> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i32_nxv2i1(<vscale x 2 x i32> %v) {
-; CHECK-LABEL: trunc_nxv2i32_nxv2i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 2 x i32> %v to <vscale x 2 x i1>
- ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i32> @sext_nxv4i1_nxv4i32(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i32>
- ret <vscale x 4 x i32> %r
-}
-
-define <vscale x 4 x i32> @zext_nxv4i1_nxv4i32(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i32>
- ret <vscale x 4 x i32> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i32_nxv4i1(<vscale x 4 x i32> %v) {
-; CHECK-LABEL: trunc_nxv4i32_nxv4i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 4 x i32> %v to <vscale x 4 x i1>
- ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i32> @sext_nxv8i1_nxv8i32(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i32>
- ret <vscale x 8 x i32> %r
-}
-
-define <vscale x 8 x i32> @zext_nxv8i1_nxv8i32(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i32>
- ret <vscale x 8 x i32> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i32_nxv8i1(<vscale x 8 x i32> %v) {
-; CHECK-LABEL: trunc_nxv8i32_nxv8i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 8 x i32> %v to <vscale x 8 x i1>
- ret <vscale x 8 x i1> %r
-}
-
-define <vscale x 16 x i32> @sext_nxv16i1_nxv16i32(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: sext_nxv16i1_nxv16i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 16 x i1> %v to <vscale x 16 x i32>
- ret <vscale x 16 x i32> %r
-}
-
-define <vscale x 16 x i32> @zext_nxv16i1_nxv16i32(<vscale x 16 x i1> %v) {
-; CHECK-LABEL: zext_nxv16i1_nxv16i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 16 x i1> %v to <vscale x 16 x i32>
- ret <vscale x 16 x i32> %r
-}
-
-define <vscale x 16 x i1> @trunc_nxv16i32_nxv16i1(<vscale x 16 x i32> %v) {
-; CHECK-LABEL: trunc_nxv16i32_nxv16i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 16 x i32> %v to <vscale x 16 x i1>
- ret <vscale x 16 x i1> %r
-}
-
-define <vscale x 1 x i64> @sext_nxv1i1_nxv1i64(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: sext_nxv1i1_nxv1i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 1 x i1> %v to <vscale x 1 x i64>
- ret <vscale x 1 x i64> %r
-}
-
-define <vscale x 1 x i64> @zext_nxv1i1_nxv1i64(<vscale x 1 x i1> %v) {
-; CHECK-LABEL: zext_nxv1i1_nxv1i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 1 x i1> %v to <vscale x 1 x i64>
- ret <vscale x 1 x i64> %r
-}
-
-define <vscale x 1 x i1> @trunc_nxv1i64_nxv1i1(<vscale x 1 x i64> %v) {
-; CHECK-LABEL: trunc_nxv1i64_nxv1i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 1 x i64> %v to <vscale x 1 x i1>
- ret <vscale x 1 x i1> %r
-}
-
-define <vscale x 2 x i64> @sext_nxv2i1_nxv2i64(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: sext_nxv2i1_nxv2i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 2 x i1> %v to <vscale x 2 x i64>
- ret <vscale x 2 x i64> %r
-}
-
-define <vscale x 2 x i64> @zext_nxv2i1_nxv2i64(<vscale x 2 x i1> %v) {
-; CHECK-LABEL: zext_nxv2i1_nxv2i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 2 x i1> %v to <vscale x 2 x i64>
- ret <vscale x 2 x i64> %r
-}
-
-define <vscale x 2 x i1> @trunc_nxv2i64_nxv2i1(<vscale x 2 x i64> %v) {
-; CHECK-LABEL: trunc_nxv2i64_nxv2i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 2 x i64> %v to <vscale x 2 x i1>
- ret <vscale x 2 x i1> %r
-}
-
-define <vscale x 4 x i64> @sext_nxv4i1_nxv4i64(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: sext_nxv4i1_nxv4i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 4 x i1> %v to <vscale x 4 x i64>
- ret <vscale x 4 x i64> %r
-}
-
-define <vscale x 4 x i64> @zext_nxv4i1_nxv4i64(<vscale x 4 x i1> %v) {
-; CHECK-LABEL: zext_nxv4i1_nxv4i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 4 x i1> %v to <vscale x 4 x i64>
- ret <vscale x 4 x i64> %r
-}
-
-define <vscale x 4 x i1> @trunc_nxv4i64_nxv4i1(<vscale x 4 x i64> %v) {
-; CHECK-LABEL: trunc_nxv4i64_nxv4i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 4 x i64> %v to <vscale x 4 x i1>
- ret <vscale x 4 x i1> %r
-}
-
-define <vscale x 8 x i64> @sext_nxv8i1_nxv8i64(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: sext_nxv8i1_nxv8i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
-; CHECK-NEXT: ret
- %r = sext <vscale x 8 x i1> %v to <vscale x 8 x i64>
- ret <vscale x 8 x i64> %r
-}
-
-define <vscale x 8 x i64> @zext_nxv8i1_nxv8i64(<vscale x 8 x i1> %v) {
-; CHECK-LABEL: zext_nxv8i1_nxv8i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: ret
- %r = zext <vscale x 8 x i1> %v to <vscale x 8 x i64>
- ret <vscale x 8 x i64> %r
-}
-
-define <vscale x 8 x i1> @trunc_nxv8i64_nxv8i1(<vscale x 8 x i64> %v) {
-; CHECK-LABEL: trunc_nxv8i64_nxv8i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vand.vi v8, v8, 1
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: ret
- %r = trunc <vscale x 8 x i64> %v to <vscale x 8 x i1>
- ret <vscale x 8 x i1> %r
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs.ll
index ef5ad1f..023a704 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mask-exts-truncs.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s
define <vscale x 1 x i8> @sext_nxv1i1_nxv1i8(<vscale x 1 x i1> %v) {
; CHECK-LABEL: sext_nxv1i1_nxv1i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
index 1c3b429..e686ac8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64
%struct = type { i64, i64, ptr, i32, i32, i32, [4 x i32] }
diff --git a/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll b/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll
index 64bc1ef..b6f9d31 100644
--- a/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/regalloc-fast-crash.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh,+m \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m \
; RUN: -regalloc=fast -verify-machineinstrs < %s | FileCheck %s
; This test previously crashed with an error "ran out of registers during register allocation"
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
index a08bcae..259515f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll
@@ -1196,3 +1196,24 @@ define <vscale x 2 x i32> @true_mask_vmerge_implicit_passthru(<vscale x 2 x i32>
)
ret <vscale x 2 x i32> %b
}
+
+
+define <vscale x 2 x i32> @unfoldable_mismatched_sew(<vscale x 2 x i32> %passthru, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, <vscale x 2 x i1> %mask, i64 %avl) {
+; CHECK-LABEL: unfoldable_mismatched_sew:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, i64 %avl)
+ %a.bitcast = bitcast <vscale x 1 x i64> %a to <vscale x 2 x i32>
+ %b = call <vscale x 2 x i32> @llvm.riscv.vmerge.nxv2i32.nxv2i32(
+ <vscale x 2 x i32> %passthru,
+ <vscale x 2 x i32> %passthru,
+ <vscale x 2 x i32> %a.bitcast,
+ <vscale x 2 x i1> splat (i1 true),
+ i64 %avl
+ )
+ ret <vscale x 2 x i32> %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 0010f64..14976f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-OPT,CHECK-OPT-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+optimized-zero-stride-load \
; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-OPT,CHECK-OPT-RV64
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32,CHECK-NO-OPT,CHECK-NO-OPT-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64,CHECK-NO-OPT,CHECK-NO-OPT-RV64
declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)
@@ -823,15 +823,15 @@ define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) {
ret <vscale x 1 x half> %load
}
-define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr) {
-; CHECK-RV32-LABEL: zero_strided_vadd.vx:
+define <vscale x 1 x i64> @zero_strided_vadd_nxv1i64(<vscale x 1 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd_nxv1i64:
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-RV32-NEXT: vlse64.v v9, (a0), zero
; CHECK-RV32-NEXT: vadd.vv v8, v8, v9
; CHECK-RV32-NEXT: ret
;
-; CHECK-RV64-LABEL: zero_strided_vadd.vx:
+; CHECK-RV64-LABEL: zero_strided_vadd_nxv1i64:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: ld a0, 0(a0)
; CHECK-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
@@ -842,3 +842,69 @@ define <vscale x 1 x i64> @zero_strided_vadd.vx(<vscale x 1 x i64> %v, ptr %ptr)
%w = add <vscale x 1 x i64> %v, %load
ret <vscale x 1 x i64> %w
}
+
+define <vscale x 16 x i64> @zero_strided_vadd_nxv16i64(<vscale x 16 x i64> %v, ptr %ptr) {
+; CHECK-RV32-LABEL: zero_strided_vadd_nxv16i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: csrr a1, vlenb
+; CHECK-RV32-NEXT: srli a2, a1, 3
+; CHECK-RV32-NEXT: sub a3, a2, a1
+; CHECK-RV32-NEXT: sltu a4, a2, a3
+; CHECK-RV32-NEXT: addi a4, a4, -1
+; CHECK-RV32-NEXT: and a3, a4, a3
+; CHECK-RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v24, (a0), zero
+; CHECK-RV32-NEXT: bltu a2, a1, .LBB55_2
+; CHECK-RV32-NEXT: # %bb.1:
+; CHECK-RV32-NEXT: mv a2, a1
+; CHECK-RV32-NEXT: .LBB55_2:
+; CHECK-RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vlse64.v v0, (a0), zero
+; CHECK-RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-RV32-NEXT: vadd.vv v16, v16, v24
+; CHECK-RV32-NEXT: vadd.vv v8, v8, v0
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64-LABEL: zero_strided_vadd_nxv16i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-RV64-NEXT: vadd.vx v8, v8, a0
+; CHECK-RV64-NEXT: vadd.vx v16, v16, a0
+; CHECK-RV64-NEXT: ret
+ %vscale = call i32 @llvm.vscale()
+ %load = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i32(ptr %ptr, i32 0, <vscale x 16 x i1> splat (i1 true), i32 %vscale)
+ %w = add <vscale x 16 x i64> %v, %load
+ ret <vscale x 16 x i64> %w
+}
+
+define <vscale x 1 x ptr> @zero_strided_vadd_nxv1p0(<vscale x 1 x ptr> %v, ptr %ptr) {
+; CHECK-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-OPT-RV32: # %bb.0:
+; CHECK-OPT-RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-OPT-RV32-NEXT: vlse32.v v8, (a0), zero
+; CHECK-OPT-RV32-NEXT: ret
+;
+; CHECK-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-OPT-RV64: # %bb.0:
+; CHECK-OPT-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-OPT-RV64-NEXT: vlse64.v v8, (a0), zero
+; CHECK-OPT-RV64-NEXT: ret
+;
+; CHECK-NO-OPT-RV32-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-NO-OPT-RV32: # %bb.0:
+; CHECK-NO-OPT-RV32-NEXT: lw a0, 0(a0)
+; CHECK-NO-OPT-RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NO-OPT-RV32-NEXT: vmv.v.x v8, a0
+; CHECK-NO-OPT-RV32-NEXT: ret
+;
+; CHECK-NO-OPT-RV64-LABEL: zero_strided_vadd_nxv1p0:
+; CHECK-NO-OPT-RV64: # %bb.0:
+; CHECK-NO-OPT-RV64-NEXT: ld a0, 0(a0)
+; CHECK-NO-OPT-RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NO-OPT-RV64-NEXT: vmv.v.x v8, a0
+; CHECK-NO-OPT-RV64-NEXT: ret
+ %vscale = call i32 @llvm.vscale()
+ %load = call <vscale x 1 x ptr> @llvm.experimental.vp.strided.load.nxv1p0.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 %vscale)
+ ret <vscale x 1 x ptr> %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
index 28d7588..d2f7382 100644
--- a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zve64x,+zvl128b | FileCheck %s
define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) {
; CHECK-LABEL: test_v4i16_v4i8:
diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll
index 503085f0..ae8c36a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-ta.ll
@@ -928,7 +928,7 @@ declare <vscale x 1 x half> @llvm.riscv.vfmv.s.f.nxv1f16(<vscale x 1 x half>, ha
define <vscale x 1 x half> @intrinsic_vfmv.s.f_f_nxv1f16(half %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
index 439301f..1f027ae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/unmasked-tu.ll
@@ -2576,21 +2576,21 @@ entry:
ret <vscale x 8 x double> %a
}
-declare <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+declare <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
<vscale x 1 x half>,
<vscale x 1 x half>,
<vscale x 1 x half>,
<vscale x 1 x i1>,
iXLen);
-define <vscale x 1 x half> @intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
+define <vscale x 1 x half> @intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x half> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma
; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+ %a = call <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
<vscale x 1 x half> %0,
<vscale x 1 x half> %1,
<vscale x 1 x half> %2,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll
index d9df1d4..e47c2a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmerge.ll
@@ -4,21 +4,21 @@
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh \
; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s
-declare <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+declare <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
<vscale x 1 x half>,
<vscale x 1 x half>,
<vscale x 1 x half>,
<vscale x 1 x i1>,
iXLen);
-define <vscale x 1 x half> @intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
+define <vscale x 1 x half> @intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f16_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vfmerge.nxv1f16.nxv1f16(
+ %a = call <vscale x 1 x half> @llvm.riscv.vmerge.nxv1f16.nxv1f16(
<vscale x 1 x half> undef,
<vscale x 1 x half> %0,
<vscale x 1 x half> %1,
@@ -52,21 +52,21 @@ entry:
ret <vscale x 1 x half> %a
}
-declare <vscale x 2 x half> @llvm.riscv.vfmerge.nxv2f16.nxv2f16(
+declare <vscale x 2 x half> @llvm.riscv.vmerge.nxv2f16.nxv2f16(
<vscale x 2 x half>,
<vscale x 2 x half>,
<vscale x 2 x half>,
<vscale x 2 x i1>,
iXLen);
-define <vscale x 2 x half> @intrinsic_vfmerge_vvm_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv2f16_nxv2f16_nxv2f16:
+define <vscale x 2 x half> @intrinsic_vmerge_vvm_nxv2f16_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv2f16_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vfmerge.nxv2f16.nxv2f16(
+ %a = call <vscale x 2 x half> @llvm.riscv.vmerge.nxv2f16.nxv2f16(
<vscale x 2 x half> undef,
<vscale x 2 x half> %0,
<vscale x 2 x half> %1,
@@ -100,21 +100,21 @@ entry:
ret <vscale x 2 x half> %a
}
-declare <vscale x 4 x half> @llvm.riscv.vfmerge.nxv4f16.nxv4f16(
+declare <vscale x 4 x half> @llvm.riscv.vmerge.nxv4f16.nxv4f16(
<vscale x 4 x half>,
<vscale x 4 x half>,
<vscale x 4 x half>,
<vscale x 4 x i1>,
iXLen);
-define <vscale x 4 x half> @intrinsic_vfmerge_vvm_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv4f16_nxv4f16_nxv4f16:
+define <vscale x 4 x half> @intrinsic_vmerge_vvm_nxv4f16_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv4f16_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vfmerge.nxv4f16.nxv4f16(
+ %a = call <vscale x 4 x half> @llvm.riscv.vmerge.nxv4f16.nxv4f16(
<vscale x 4 x half> undef,
<vscale x 4 x half> %0,
<vscale x 4 x half> %1,
@@ -148,21 +148,21 @@ entry:
ret <vscale x 4 x half> %a
}
-declare <vscale x 8 x half> @llvm.riscv.vfmerge.nxv8f16.nxv8f16(
+declare <vscale x 8 x half> @llvm.riscv.vmerge.nxv8f16.nxv8f16(
<vscale x 8 x half>,
<vscale x 8 x half>,
<vscale x 8 x half>,
<vscale x 8 x i1>,
iXLen);
-define <vscale x 8 x half> @intrinsic_vfmerge_vvm_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv8f16_nxv8f16_nxv8f16:
+define <vscale x 8 x half> @intrinsic_vmerge_vvm_nxv8f16_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv8f16_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vfmerge.nxv8f16.nxv8f16(
+ %a = call <vscale x 8 x half> @llvm.riscv.vmerge.nxv8f16.nxv8f16(
<vscale x 8 x half> undef,
<vscale x 8 x half> %0,
<vscale x 8 x half> %1,
@@ -196,21 +196,21 @@ entry:
ret <vscale x 8 x half> %a
}
-declare <vscale x 16 x half> @llvm.riscv.vfmerge.nxv16f16.nxv16f16(
+declare <vscale x 16 x half> @llvm.riscv.vmerge.nxv16f16.nxv16f16(
<vscale x 16 x half>,
<vscale x 16 x half>,
<vscale x 16 x half>,
<vscale x 16 x i1>,
iXLen);
-define <vscale x 16 x half> @intrinsic_vfmerge_vvm_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv16f16_nxv16f16_nxv16f16:
+define <vscale x 16 x half> @intrinsic_vmerge_vvm_nxv16f16_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv16f16_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vfmerge.nxv16f16.nxv16f16(
+ %a = call <vscale x 16 x half> @llvm.riscv.vmerge.nxv16f16.nxv16f16(
<vscale x 16 x half> undef,
<vscale x 16 x half> %0,
<vscale x 16 x half> %1,
@@ -244,21 +244,21 @@ entry:
ret <vscale x 16 x half> %a
}
-declare <vscale x 32 x half> @llvm.riscv.vfmerge.nxv32f16.nxv32f16(
+declare <vscale x 32 x half> @llvm.riscv.vmerge.nxv32f16.nxv32f16(
<vscale x 32 x half>,
<vscale x 32 x half>,
<vscale x 32 x half>,
<vscale x 32 x i1>,
iXLen);
-define <vscale x 32 x half> @intrinsic_vfmerge_vvm_nxv32f16_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv32f16_nxv32f16_nxv32f16:
+define <vscale x 32 x half> @intrinsic_vmerge_vvm_nxv32f16_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv32f16_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vfmerge.nxv32f16.nxv32f16(
+ %a = call <vscale x 32 x half> @llvm.riscv.vmerge.nxv32f16.nxv32f16(
<vscale x 32 x half> undef,
<vscale x 32 x half> %0,
<vscale x 32 x half> %1,
@@ -292,21 +292,21 @@ entry:
ret <vscale x 32 x half> %a
}
-declare <vscale x 1 x float> @llvm.riscv.vfmerge.nxv1f32.nxv1f32(
+declare <vscale x 1 x float> @llvm.riscv.vmerge.nxv1f32.nxv1f32(
<vscale x 1 x float>,
<vscale x 1 x float>,
<vscale x 1 x float>,
<vscale x 1 x i1>,
iXLen);
-define <vscale x 1 x float> @intrinsic_vfmerge_vvm_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f32_nxv1f32_nxv1f32:
+define <vscale x 1 x float> @intrinsic_vmerge_vvm_nxv1f32_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f32_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vfmerge.nxv1f32.nxv1f32(
+ %a = call <vscale x 1 x float> @llvm.riscv.vmerge.nxv1f32.nxv1f32(
<vscale x 1 x float> undef,
<vscale x 1 x float> %0,
<vscale x 1 x float> %1,
@@ -340,21 +340,21 @@ entry:
ret <vscale x 1 x float> %a
}
-declare <vscale x 2 x float> @llvm.riscv.vfmerge.nxv2f32.nxv2f32(
+declare <vscale x 2 x float> @llvm.riscv.vmerge.nxv2f32.nxv2f32(
<vscale x 2 x float>,
<vscale x 2 x float>,
<vscale x 2 x float>,
<vscale x 2 x i1>,
iXLen);
-define <vscale x 2 x float> @intrinsic_vfmerge_vvm_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv2f32_nxv2f32_nxv2f32:
+define <vscale x 2 x float> @intrinsic_vmerge_vvm_nxv2f32_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv2f32_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vfmerge.nxv2f32.nxv2f32(
+ %a = call <vscale x 2 x float> @llvm.riscv.vmerge.nxv2f32.nxv2f32(
<vscale x 2 x float> undef,
<vscale x 2 x float> %0,
<vscale x 2 x float> %1,
@@ -388,21 +388,21 @@ entry:
ret <vscale x 2 x float> %a
}
-declare <vscale x 4 x float> @llvm.riscv.vfmerge.nxv4f32.nxv4f32(
+declare <vscale x 4 x float> @llvm.riscv.vmerge.nxv4f32.nxv4f32(
<vscale x 4 x float>,
<vscale x 4 x float>,
<vscale x 4 x float>,
<vscale x 4 x i1>,
iXLen);
-define <vscale x 4 x float> @intrinsic_vfmerge_vvm_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv4f32_nxv4f32_nxv4f32:
+define <vscale x 4 x float> @intrinsic_vmerge_vvm_nxv4f32_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv4f32_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vfmerge.nxv4f32.nxv4f32(
+ %a = call <vscale x 4 x float> @llvm.riscv.vmerge.nxv4f32.nxv4f32(
<vscale x 4 x float> undef,
<vscale x 4 x float> %0,
<vscale x 4 x float> %1,
@@ -436,21 +436,21 @@ entry:
ret <vscale x 4 x float> %a
}
-declare <vscale x 8 x float> @llvm.riscv.vfmerge.nxv8f32.nxv8f32(
+declare <vscale x 8 x float> @llvm.riscv.vmerge.nxv8f32.nxv8f32(
<vscale x 8 x float>,
<vscale x 8 x float>,
<vscale x 8 x float>,
<vscale x 8 x i1>,
iXLen);
-define <vscale x 8 x float> @intrinsic_vfmerge_vvm_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv8f32_nxv8f32_nxv8f32:
+define <vscale x 8 x float> @intrinsic_vmerge_vvm_nxv8f32_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv8f32_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vfmerge.nxv8f32.nxv8f32(
+ %a = call <vscale x 8 x float> @llvm.riscv.vmerge.nxv8f32.nxv8f32(
<vscale x 8 x float> undef,
<vscale x 8 x float> %0,
<vscale x 8 x float> %1,
@@ -484,21 +484,21 @@ entry:
ret <vscale x 8 x float> %a
}
-declare <vscale x 16 x float> @llvm.riscv.vfmerge.nxv16f32.nxv16f32(
+declare <vscale x 16 x float> @llvm.riscv.vmerge.nxv16f32.nxv16f32(
<vscale x 16 x float>,
<vscale x 16 x float>,
<vscale x 16 x float>,
<vscale x 16 x i1>,
iXLen);
-define <vscale x 16 x float> @intrinsic_vfmerge_vvm_nxv16f32_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv16f32_nxv16f32_nxv16f32:
+define <vscale x 16 x float> @intrinsic_vmerge_vvm_nxv16f32_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv16f32_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vfmerge.nxv16f32.nxv16f32(
+ %a = call <vscale x 16 x float> @llvm.riscv.vmerge.nxv16f32.nxv16f32(
<vscale x 16 x float> undef,
<vscale x 16 x float> %0,
<vscale x 16 x float> %1,
@@ -532,21 +532,21 @@ entry:
ret <vscale x 16 x float> %a
}
-declare <vscale x 1 x double> @llvm.riscv.vfmerge.nxv1f64.nxv1f64(
+declare <vscale x 1 x double> @llvm.riscv.vmerge.nxv1f64.nxv1f64(
<vscale x 1 x double>,
<vscale x 1 x double>,
<vscale x 1 x double>,
<vscale x 1 x i1>,
iXLen);
-define <vscale x 1 x double> @intrinsic_vfmerge_vvm_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv1f64_nxv1f64_nxv1f64:
+define <vscale x 1 x double> @intrinsic_vmerge_vvm_nxv1f64_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv1f64_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vfmerge.nxv1f64.nxv1f64(
+ %a = call <vscale x 1 x double> @llvm.riscv.vmerge.nxv1f64.nxv1f64(
<vscale x 1 x double> undef,
<vscale x 1 x double> %0,
<vscale x 1 x double> %1,
@@ -587,14 +587,14 @@ declare <vscale x 2 x double> @llvm.riscv.vfmerge.nxv2f64.nxv2f64(
<vscale x 2 x i1>,
iXLen);
-define <vscale x 2 x double> @intrinsic_vfmerge_vvm_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv2f64_nxv2f64_nxv2f64:
+define <vscale x 2 x double> @intrinsic_vmerge_vvm_nxv2f64_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv2f64_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vfmerge.nxv2f64.nxv2f64(
+ %a = call <vscale x 2 x double> @llvm.riscv.vmerge.nxv2f64.nxv2f64(
<vscale x 2 x double> undef,
<vscale x 2 x double> %0,
<vscale x 2 x double> %1,
@@ -628,21 +628,21 @@ entry:
ret <vscale x 2 x double> %a
}
-declare <vscale x 4 x double> @llvm.riscv.vfmerge.nxv4f64.nxv4f64(
+declare <vscale x 4 x double> @llvm.riscv.vmerge.nxv4f64.nxv4f64(
<vscale x 4 x double>,
<vscale x 4 x double>,
<vscale x 4 x double>,
<vscale x 4 x i1>,
iXLen);
-define <vscale x 4 x double> @intrinsic_vfmerge_vvm_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv4f64_nxv4f64_nxv4f64:
+define <vscale x 4 x double> @intrinsic_vmerge_vvm_nxv4f64_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv4f64_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vfmerge.nxv4f64.nxv4f64(
+ %a = call <vscale x 4 x double> @llvm.riscv.vmerge.nxv4f64.nxv4f64(
<vscale x 4 x double> undef,
<vscale x 4 x double> %0,
<vscale x 4 x double> %1,
@@ -676,21 +676,21 @@ entry:
ret <vscale x 4 x double> %a
}
-declare <vscale x 8 x double> @llvm.riscv.vfmerge.nxv8f64.nxv8f64(
+declare <vscale x 8 x double> @llvm.riscv.vmerge.nxv8f64.nxv8f64(
<vscale x 8 x double>,
<vscale x 8 x double>,
<vscale x 8 x double>,
<vscale x 8 x i1>,
iXLen);
-define <vscale x 8 x double> @intrinsic_vfmerge_vvm_nxv8f64_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
-; CHECK-LABEL: intrinsic_vfmerge_vvm_nxv8f64_nxv8f64_nxv8f64:
+define <vscale x 8 x double> @intrinsic_vmerge_vvm_nxv8f64_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vmerge_vvm_nxv8f64_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vfmerge.nxv8f64.nxv8f64(
+ %a = call <vscale x 8 x double> @llvm.riscv.vmerge.nxv8f64.nxv8f64(
<vscale x 8 x double> undef,
<vscale x 8 x double> %0,
<vscale x 8 x double> %1,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll
index 47b4b61..af1c378 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.f.s.ll
@@ -7,7 +7,7 @@ declare half @llvm.riscv.vfmv.f.s.nxv1f16(<vscale x 1 x half>)
define half @intrinsic_vfmv.f.s_s_nxv1f16(<vscale x 1 x half> %0) nounwind {
; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
entry:
@@ -20,7 +20,7 @@ declare half @llvm.riscv.vfmv.f.s.nxv2f16(<vscale x 2 x half>)
define half @intrinsic_vfmv.f.s_s_nxv2f16(<vscale x 2 x half> %0) nounwind {
; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
entry:
@@ -85,7 +85,7 @@ declare float @llvm.riscv.vfmv.f.s.nxv1f32(<vscale x 1 x float>)
define float @intrinsic_vfmv.f.s_s_nxv1f32(<vscale x 1 x float> %0) nounwind {
; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll
index b3aab23..1e863a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmv.s.f.ll
@@ -9,7 +9,7 @@ declare <vscale x 1 x half> @llvm.riscv.vfmv.s.f.nxv1f16(<vscale x 1 x half>, ha
define <vscale x 1 x half> @intrinsic_vfmv.s.f_f_nxv1f16(<vscale x 1 x half> %0, half %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
entry:
@@ -22,7 +22,7 @@ declare <vscale x 2 x half> @llvm.riscv.vfmv.s.f.nxv2f16(<vscale x 2 x half>, ha
define <vscale x 2 x half> @intrinsic_vfmv.s.f_f_nxv2f16(<vscale x 2 x half> %0, half %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv2f16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
entry:
@@ -87,7 +87,7 @@ declare <vscale x 1 x float> @llvm.riscv.vfmv.s.f.nxv1f32(<vscale x 1 x float>,
define <vscale x 1 x float> @intrinsic_vfmv.s.f_f_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vfmv.s.f_f_nxv1f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
; CHECK-NEXT: vfmv.s.f v8, fa0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index 390647f..578b5dc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -23,7 +23,7 @@ entry:
ret i64 %1
}
-define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %merge, ptr %p, i64 %vl) {
+define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %passthru, ptr %p, i64 %vl) {
; CHECK-LABEL: name: test_vleff_nxv8i8_tu
; CHECK: bb.0.entry:
; CHECK-NEXT: liveins: $v8, $x10, $x11
@@ -35,7 +35,7 @@ define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %merge, ptr %p, i64 %vl) {
; CHECK-NEXT: $x10 = COPY [[PseudoVLE8FF_V_M1_1]]
; CHECK-NEXT: PseudoRET implicit $x10
entry:
- %0 = call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff.nxv8i8(<vscale x 8 x i8> %merge, ptr %p, i64 %vl)
+ %0 = call { <vscale x 8 x i8>, i64 } @llvm.riscv.vleff.nxv8i8(<vscale x 8 x i8> %passthru, ptr %p, i64 %vl)
%1 = extractvalue { <vscale x 8 x i8>, i64 } %0, 1
ret i64 %1
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
index 64e3391..9588c85 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zfh,+zvfh \
+; RUN: llc -mtriple=riscv32 -mattr=+zve64x \
; RUN: -verify-machineinstrs < %s | FileCheck %s
declare {<vscale x 16 x i16>,<vscale x 16 x i16>, i32} @llvm.riscv.vlseg2ff.nxv16i16(<vscale x 16 x i16>,<vscale x 16 x i16>, ptr , i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
index 2cc924e..02c2994 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zfh,+zvfh \
+; RUN: llc -mtriple=riscv64 -mattr=+zve64x \
; RUN: -verify-machineinstrs < %s | FileCheck %s
declare {<vscale x 16 x i16>,<vscale x 16 x i16>, i64} @llvm.riscv.vlseg2ff.nxv16i16(<vscale x 16 x i16>,<vscale x 16 x i16>, ptr , i64)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
index 5aba7ef..e232ac2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmacc-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll
index 22ed56a..5401bf7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv32.ll
deleted file mode 100644
index 0c66000..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv32.ll
+++ /dev/null
@@ -1,316 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8>, i8, i32)
-
-define <vscale x 1 x i8> @intrinsic_vmv.s.x_x_nxv1i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8> %0, i8 %1, i32 %2)
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8>, i8, i32)
-
-define <vscale x 2 x i8> @intrinsic_vmv.s.x_x_nxv2i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8> %0, i8 %1, i32 %2)
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8>, i8, i32)
-
-define <vscale x 4 x i8> @intrinsic_vmv.s.x_x_nxv4i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8> %0, i8 %1, i32 %2)
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8>, i8, i32)
-
-define <vscale x 8 x i8> @intrinsic_vmv.s.x_x_nxv8i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8> %0, i8 %1, i32 %2)
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8>, i8, i32)
-
-define <vscale x 16 x i8> @intrinsic_vmv.s.x_x_nxv16i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8> %0, i8 %1, i32 %2)
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8>, i8, i32)
-
-define <vscale x 32 x i8> @intrinsic_vmv.s.x_x_nxv32i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8> %0, i8 %1, i32 %2)
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8>, i8, i32)
-
-define <vscale x 64 x i8> @intrinsic_vmv.s.x_x_nxv64i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8> %0, i8 %1, i32 %2)
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16>, i16, i32)
-
-define <vscale x 1 x i16> @intrinsic_vmv.s.x_x_nxv1i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16> %0, i16 %1, i32 %2)
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16>, i16, i32)
-
-define <vscale x 2 x i16> @intrinsic_vmv.s.x_x_nxv2i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16> %0, i16 %1, i32 %2)
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16>, i16, i32)
-
-define <vscale x 4 x i16> @intrinsic_vmv.s.x_x_nxv4i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16> %0, i16 %1, i32 %2)
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16>, i16, i32)
-
-define <vscale x 8 x i16> @intrinsic_vmv.s.x_x_nxv8i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16> %0, i16 %1, i32 %2)
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16>, i16, i32)
-
-define <vscale x 16 x i16> @intrinsic_vmv.s.x_x_nxv16i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16> %0, i16 %1, i32 %2)
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16>, i16, i32)
-
-define <vscale x 32 x i16> @intrinsic_vmv.s.x_x_nxv32i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16> %0, i16 %1, i32 %2)
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32>, i32, i32)
-
-define <vscale x 1 x i32> @intrinsic_vmv.s.x_x_nxv1i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32> %0, i32 %1, i32 %2)
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32>, i32, i32)
-
-define <vscale x 2 x i32> @intrinsic_vmv.s.x_x_nxv2i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32> %0, i32 %1, i32 %2)
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32>, i32, i32)
-
-define <vscale x 4 x i32> @intrinsic_vmv.s.x_x_nxv4i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32> %0, i32 %1, i32 %2)
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32>, i32, i32)
-
-define <vscale x 8 x i32> @intrinsic_vmv.s.x_x_nxv8i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32> %0, i32 %1, i32 %2)
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32>, i32, i32)
-
-define <vscale x 16 x i32> @intrinsic_vmv.s.x_x_nxv16i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32> %0, i32 %1, i32 %2)
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64>, i64, i32);
-
-define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: vmseq.vi v0, v9, 0
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %1, i32 %2)
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64>, i64, i32);
-
-define <vscale x 2 x i64> @intrinsic_vmv.s.x_x_nxv2i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: vmseq.vi v0, v10, 0
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64> %0, i64 %1, i32 %2)
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64>, i64, i32);
-
-define <vscale x 4 x i64> @intrinsic_vmv.s.x_x_nxv4i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT: vid.v v12
-; CHECK-NEXT: vmseq.vi v0, v12, 0
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64> %0, i64 %1, i32 %2)
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64>, i64, i32);
-
-define <vscale x 8 x i64> @intrinsic_vmv.s.x_x_nxv8i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vmseq.vi v0, v16, 0
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vlse64.v v8, (a0), zero, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> %0, i64 %1, i32 %2)
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
index 163eb73..afb9cba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
@@ -1,302 +1,372 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
-declare <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8>, i8, i64);
+declare <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8>, i8, iXLen);
-define <vscale x 1 x i8> @intrinsic_vmv.s.x_x_nxv1i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.s.x_x_nxv1i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8> %0, i8 %1, i64 %2)
+ %a = call <vscale x 1 x i8> @llvm.riscv.vmv.s.x.nxv1i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2)
ret <vscale x 1 x i8> %a
}
-declare <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8>, i8, i64);
+declare <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8>, i8, iXLen);
-define <vscale x 2 x i8> @intrinsic_vmv.s.x_x_nxv2i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.s.x_x_nxv2i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8> %0, i8 %1, i64 %2)
+ %a = call <vscale x 2 x i8> @llvm.riscv.vmv.s.x.nxv2i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2)
ret <vscale x 2 x i8> %a
}
-declare <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8>, i8, i64);
+declare <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8>, i8, iXLen);
-define <vscale x 4 x i8> @intrinsic_vmv.s.x_x_nxv4i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.s.x_x_nxv4i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8> %0, i8 %1, i64 %2)
+ %a = call <vscale x 4 x i8> @llvm.riscv.vmv.s.x.nxv4i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2)
ret <vscale x 4 x i8> %a
}
-declare <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8>, i8, i64);
+declare <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8>, i8, iXLen);
-define <vscale x 8 x i8> @intrinsic_vmv.s.x_x_nxv8i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.s.x_x_nxv8i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8> %0, i8 %1, i64 %2)
+ %a = call <vscale x 8 x i8> @llvm.riscv.vmv.s.x.nxv8i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2)
ret <vscale x 8 x i8> %a
}
-declare <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8>, i8, i64);
+declare <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8>, i8, iXLen);
-define <vscale x 16 x i8> @intrinsic_vmv.s.x_x_nxv16i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.s.x_x_nxv16i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8> %0, i8 %1, i64 %2)
+ %a = call <vscale x 16 x i8> @llvm.riscv.vmv.s.x.nxv16i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2)
ret <vscale x 16 x i8> %a
}
-declare <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8>, i8, i64);
+declare <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8>, i8, iXLen);
-define <vscale x 32 x i8> @intrinsic_vmv.s.x_x_nxv32i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.s.x_x_nxv32i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8> %0, i8 %1, i64 %2)
+ %a = call <vscale x 32 x i8> @llvm.riscv.vmv.s.x.nxv32i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2)
ret <vscale x 32 x i8> %a
}
-declare <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8>, i8, i64);
+declare <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8>, i8, iXLen);
-define <vscale x 64 x i8> @intrinsic_vmv.s.x_x_nxv64i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.s.x_x_nxv64i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8> %0, i8 %1, i64 %2)
+ %a = call <vscale x 64 x i8> @llvm.riscv.vmv.s.x.nxv64i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2)
ret <vscale x 64 x i8> %a
}
-declare <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16>, i16, i64);
+declare <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16>, i16, iXLen);
-define <vscale x 1 x i16> @intrinsic_vmv.s.x_x_nxv1i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.s.x_x_nxv1i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16> %0, i16 %1, i64 %2)
+ %a = call <vscale x 1 x i16> @llvm.riscv.vmv.s.x.nxv1i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2)
ret <vscale x 1 x i16> %a
}
-declare <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16>, i16, i64);
+declare <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16>, i16, iXLen);
-define <vscale x 2 x i16> @intrinsic_vmv.s.x_x_nxv2i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.s.x_x_nxv2i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16> %0, i16 %1, i64 %2)
+ %a = call <vscale x 2 x i16> @llvm.riscv.vmv.s.x.nxv2i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2)
ret <vscale x 2 x i16> %a
}
-declare <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16>, i16, i64);
+declare <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16>, i16, iXLen);
-define <vscale x 4 x i16> @intrinsic_vmv.s.x_x_nxv4i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.s.x_x_nxv4i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16> %0, i16 %1, i64 %2)
+ %a = call <vscale x 4 x i16> @llvm.riscv.vmv.s.x.nxv4i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2)
ret <vscale x 4 x i16> %a
}
-declare <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16>, i16, i64);
+declare <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16>, i16, iXLen);
-define <vscale x 8 x i16> @intrinsic_vmv.s.x_x_nxv8i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.s.x_x_nxv8i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16> %0, i16 %1, i64 %2)
+ %a = call <vscale x 8 x i16> @llvm.riscv.vmv.s.x.nxv8i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2)
ret <vscale x 8 x i16> %a
}
-declare <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16>, i16, i64);
+declare <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16>, i16, iXLen);
-define <vscale x 16 x i16> @intrinsic_vmv.s.x_x_nxv16i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.s.x_x_nxv16i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16> %0, i16 %1, i64 %2)
+ %a = call <vscale x 16 x i16> @llvm.riscv.vmv.s.x.nxv16i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2)
ret <vscale x 16 x i16> %a
}
-declare <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16>, i16, i64);
+declare <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16>, i16, iXLen);
-define <vscale x 32 x i16> @intrinsic_vmv.s.x_x_nxv32i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.s.x_x_nxv32i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16> %0, i16 %1, i64 %2)
+ %a = call <vscale x 32 x i16> @llvm.riscv.vmv.s.x.nxv32i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2)
ret <vscale x 32 x i16> %a
}
-declare <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32>, i32, i64);
+declare <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32>, i32, iXLen);
-define <vscale x 1 x i32> @intrinsic_vmv.s.x_x_nxv1i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.s.x_x_nxv1i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32> %0, i32 %1, i64 %2)
+ %a = call <vscale x 1 x i32> @llvm.riscv.vmv.s.x.nxv1i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2)
ret <vscale x 1 x i32> %a
}
-declare <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32>, i32, i64);
+declare <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32>, i32, iXLen);
-define <vscale x 2 x i32> @intrinsic_vmv.s.x_x_nxv2i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.s.x_x_nxv2i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32> %0, i32 %1, i64 %2)
+ %a = call <vscale x 2 x i32> @llvm.riscv.vmv.s.x.nxv2i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2)
ret <vscale x 2 x i32> %a
}
-declare <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32>, i32, i64);
+declare <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32>, i32, iXLen);
-define <vscale x 4 x i32> @intrinsic_vmv.s.x_x_nxv4i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.s.x_x_nxv4i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32> %0, i32 %1, i64 %2)
+ %a = call <vscale x 4 x i32> @llvm.riscv.vmv.s.x.nxv4i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2)
ret <vscale x 4 x i32> %a
}
-declare <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32>, i32, i64);
+declare <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32>, i32, iXLen);
-define <vscale x 8 x i32> @intrinsic_vmv.s.x_x_nxv8i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.s.x_x_nxv8i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32> %0, i32 %1, i64 %2)
+ %a = call <vscale x 8 x i32> @llvm.riscv.vmv.s.x.nxv8i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2)
ret <vscale x 8 x i32> %a
}
-declare <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32>, i32, i64);
+declare <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32>, i32, iXLen);
-define <vscale x 16 x i32> @intrinsic_vmv.s.x_x_nxv16i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.s.x_x_nxv16i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, a0
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32> %0, i32 %1, i64 %2)
+ %a = call <vscale x 16 x i32> @llvm.riscv.vmv.s.x.nxv16i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2)
ret <vscale x 16 x i32> %a
}
-declare <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64>, i64, i64);
-
-define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
+declare <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64>, i64, iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vid.v v9
+; RV32-NEXT: vmseq.vi v0, v9, 0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv1i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: ret
entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %1, i64 %2)
+ %a = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2)
ret <vscale x 1 x i64> %a
}
-declare <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64>, i64, i64);
-
-define <vscale x 2 x i64> @intrinsic_vmv.s.x_x_nxv2i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
+declare <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64>, i64, iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vmv.s.x_x_nxv2i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT: vid.v v10
+; RV32-NEXT: vmseq.vi v0, v10, 0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv2i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: ret
entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64> %0, i64 %1, i64 %2)
+ %a = call <vscale x 2 x i64> @llvm.riscv.vmv.s.x.nxv2i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2)
ret <vscale x 2 x i64> %a
}
-declare <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64>, i64, i64);
-
-define <vscale x 4 x i64> @intrinsic_vmv.s.x_x_nxv4i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
+declare <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64>, i64, iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vmv.s.x_x_nxv4i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT: vid.v v12
+; RV32-NEXT: vmseq.vi v0, v12, 0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv4i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: ret
entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64> %0, i64 %1, i64 %2)
+ %a = call <vscale x 4 x i64> @llvm.riscv.vmv.s.x.nxv4i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2)
ret <vscale x 4 x i64> %a
}
-declare <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64>, i64, i64);
-
-define <vscale x 8 x i64> @intrinsic_vmv.s.x_x_nxv8i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
+declare <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64>, i64, iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vmv.s.x_x_nxv8i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT: vid.v v16
+; RV32-NEXT: vmseq.vi v0, v16, 0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv8i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: ret
entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> %0, i64 %1, i64 %2)
+ %a = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2)
ret <vscale x 8 x i64> %a
}
; We should not emit a tail agnostic vlse for a tail undisturbed vmv.s.x
define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64_bug(<vscale x 1 x i64> %0, ptr %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.s.x_x_nxv1i64_bug:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: ld a0, 0(a0)
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: ret
+; RV32-LABEL: intrinsic_vmv.s.x_x_nxv1i64_bug:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: lw a1, 4(a0)
+; RV32-NEXT: lw a0, 0(a0)
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu
+; RV32-NEXT: vid.v v9
+; RV32-NEXT: vmseq.vi v0, v9, 0
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.s.x_x_nxv1i64_bug:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: ld a0, 0(a0)
+; RV64-NEXT: vsetivli zero, 1, e64, m1, tu, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: ret
entry:
%a = load i64, ptr %1, align 8
- %b = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %a, i64 1)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vmv.s.x.nxv1i64(<vscale x 1 x i64> %0, i64 %a, iXLen 1)
ret <vscale x 1 x i64> %b
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
index 8a589a3..3952e48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
@@ -180,3 +180,17 @@ define <vscale x 2 x i32> @unfoldable_vredsum(<vscale x 2 x i32> %passthru, <vsc
%b = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(<vscale x 2 x i32> %passthru, <vscale x 2 x i32> %a, iXLen 1)
ret <vscale x 2 x i32> %b
}
+
+define <vscale x 2 x i32> @unfoldable_mismatched_sew(<vscale x 2 x i32> %passthru, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, iXLen %avl) {
+; CHECK-LABEL: unfoldable_mismatched_sew:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v9, v9, v10
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma
+; CHECK-NEXT: vmv.v.v v8, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> %x, <vscale x 1 x i64> %y, iXLen %avl)
+ %a.bitcast = bitcast <vscale x 1 x i64> %a to <vscale x 2 x i32>
+ %b = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(<vscale x 2 x i32> %passthru, <vscale x 2 x i32> %a.bitcast, iXLen %avl)
+ ret <vscale x 2 x i32> %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll
deleted file mode 100644
index 6560063..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv64.ll
+++ /dev/null
@@ -1,743 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i64);
-
-define <vscale x 1 x i8> @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i64 %1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i64);
-
-define <vscale x 2 x i8> @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i64 %1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i64);
-
-define <vscale x 4 x i8> @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i64 %1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i64);
-
-define <vscale x 8 x i8> @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i64 %1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i64);
-
-define <vscale x 16 x i8> @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i64 %1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i64);
-
-define <vscale x 32 x i8> @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i64 %1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i64);
-
-define <vscale x 64 x i8> @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i64 %1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i64);
-
-define <vscale x 1 x i16> @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i64 %1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i64);
-
-define <vscale x 2 x i16> @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i64 %1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i64);
-
-define <vscale x 4 x i16> @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i64 %1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i64);
-
-define <vscale x 8 x i16> @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i64 %1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i64);
-
-define <vscale x 16 x i16> @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i64 %1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i64);
-
-define <vscale x 32 x i16> @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i64 %1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i64);
-
-define <vscale x 1 x i32> @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i64 %1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i64);
-
-define <vscale x 2 x i32> @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i64 %1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i64);
-
-define <vscale x 4 x i32> @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i64 %1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i64);
-
-define <vscale x 8 x i32> @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i64 %1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i64);
-
-define <vscale x 16 x i32> @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i64 %1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
- <vscale x 1 x half>,
- <vscale x 1 x half>,
- i64);
-
-define <vscale x 1 x half> @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f16_nxv1f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
- <vscale x 1 x half> undef,
- <vscale x 1 x half> %0,
- i64 %1)
-
- ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
- <vscale x 2 x half>,
- <vscale x 2 x half>,
- i64);
-
-define <vscale x 2 x half> @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f16_nxv2f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
- <vscale x 2 x half> undef,
- <vscale x 2 x half> %0,
- i64 %1)
-
- ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
- <vscale x 4 x half>,
- <vscale x 4 x half>,
- i64);
-
-define <vscale x 4 x half> @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f16_nxv4f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
- <vscale x 4 x half> undef,
- <vscale x 4 x half> %0,
- i64 %1)
-
- ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
- <vscale x 8 x half>,
- <vscale x 8 x half>,
- i64);
-
-define <vscale x 8 x half> @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f16_nxv8f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
- <vscale x 8 x half> undef,
- <vscale x 8 x half> %0,
- i64 %1)
-
- ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
- <vscale x 16 x half>,
- <vscale x 16 x half>,
- i64);
-
-define <vscale x 16 x half> @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f16_nxv16f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
- <vscale x 16 x half> undef,
- <vscale x 16 x half> %0,
- i64 %1)
-
- ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
- <vscale x 32 x half>,
- <vscale x 32 x half>,
- i64);
-
-define <vscale x 32 x half> @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32f16_nxv32f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
- <vscale x 32 x half> undef,
- <vscale x 32 x half> %0,
- i64 %1)
-
- ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
- <vscale x 1 x float>,
- <vscale x 1 x float>,
- i64);
-
-define <vscale x 1 x float> @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
- <vscale x 1 x float> undef,
- <vscale x 1 x float> %0,
- i64 %1)
-
- ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
- <vscale x 2 x float>,
- <vscale x 2 x float>,
- i64);
-
-define <vscale x 2 x float> @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f32_nxv2f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
- <vscale x 2 x float> undef,
- <vscale x 2 x float> %0,
- i64 %1)
-
- ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
- <vscale x 4 x float>,
- <vscale x 4 x float>,
- i64);
-
-define <vscale x 4 x float> @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f32_nxv4f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
- <vscale x 4 x float> undef,
- <vscale x 4 x float> %0,
- i64 %1)
-
- ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
- <vscale x 8 x float>,
- <vscale x 8 x float>,
- i64);
-
-define <vscale x 8 x float> @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f32_nxv8f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
- <vscale x 8 x float> undef,
- <vscale x 8 x float> %0,
- i64 %1)
-
- ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
- <vscale x 16 x float>,
- <vscale x 16 x float>,
- i64);
-
-define <vscale x 16 x float> @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f32_nxv16f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
- <vscale x 16 x float> undef,
- <vscale x 16 x float> %0,
- i64 %1)
-
- ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
- <vscale x 1 x double>,
- <vscale x 1 x double>,
- i64);
-
-define <vscale x 1 x double> @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f64_nxv1f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
- <vscale x 1 x double> undef,
- <vscale x 1 x double> %0,
- i64 %1)
-
- ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
- <vscale x 2 x double>,
- <vscale x 2 x double>,
- i64);
-
-define <vscale x 2 x double> @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f64_nxv2f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
- <vscale x 2 x double> undef,
- <vscale x 2 x double> %0,
- i64 %1)
-
- ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
- <vscale x 4 x double>,
- <vscale x 4 x double>,
- i64);
-
-define <vscale x 4 x double> @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f64_nxv4f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
- <vscale x 4 x double> undef,
- <vscale x 4 x double> %0,
- i64 %1)
-
- ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
- <vscale x 8 x double>,
- <vscale x 8 x double>,
- i64);
-
-define <vscale x 8 x double> @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f64_nxv8f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmv.v.v v8, v8
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
- <vscale x 8 x double> undef,
- <vscale x 8 x double> %0,
- i64 %1)
-
- ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll
index ca31720..cdf8829 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v.ll
@@ -1,13 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \
+; RUN: -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \
+; RUN: -verify-machineinstrs | FileCheck %s
declare <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i32);
+ iXLen);
-define <vscale x 1 x i8> @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, i32 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.v.v_v_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -17,7 +19,7 @@ entry:
%a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.v.nxv1i8(
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 1 x i8> %a
}
@@ -25,9 +27,9 @@ entry:
declare <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i32);
+ iXLen);
-define <vscale x 2 x i8> @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, i32 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.v.v_v_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -37,7 +39,7 @@ entry:
%a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.v.nxv2i8(
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 2 x i8> %a
}
@@ -45,9 +47,9 @@ entry:
declare <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i32);
+ iXLen);
-define <vscale x 4 x i8> @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, i32 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.v.v_v_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -57,7 +59,7 @@ entry:
%a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.v.nxv4i8(
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 4 x i8> %a
}
@@ -65,9 +67,9 @@ entry:
declare <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i32);
+ iXLen);
-define <vscale x 8 x i8> @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, i32 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.v.v_v_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -77,7 +79,7 @@ entry:
%a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.v.nxv8i8(
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 8 x i8> %a
}
@@ -85,9 +87,9 @@ entry:
declare <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i32);
+ iXLen);
-define <vscale x 16 x i8> @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, i32 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.v.v_v_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -97,7 +99,7 @@ entry:
%a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.v.nxv16i8(
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 16 x i8> %a
}
@@ -105,9 +107,9 @@ entry:
declare <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i32);
+ iXLen);
-define <vscale x 32 x i8> @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, i32 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.v.v_v_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -117,7 +119,7 @@ entry:
%a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.v.nxv32i8(
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 32 x i8> %a
}
@@ -125,9 +127,9 @@ entry:
declare <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i32);
+ iXLen);
-define <vscale x 64 x i8> @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, i32 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.v.v_v_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -137,7 +139,7 @@ entry:
%a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.v.nxv64i8(
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 64 x i8> %a
}
@@ -145,9 +147,9 @@ entry:
declare <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i32);
+ iXLen);
-define <vscale x 1 x i16> @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, i32 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.v.v_v_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -157,7 +159,7 @@ entry:
%a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.v.nxv1i16(
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 1 x i16> %a
}
@@ -165,9 +167,9 @@ entry:
declare <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i32);
+ iXLen);
-define <vscale x 2 x i16> @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, i32 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.v.v_v_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -177,7 +179,7 @@ entry:
%a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.v.nxv2i16(
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 2 x i16> %a
}
@@ -185,9 +187,9 @@ entry:
declare <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i32);
+ iXLen);
-define <vscale x 4 x i16> @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, i32 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.v.v_v_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -197,7 +199,7 @@ entry:
%a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.v.nxv4i16(
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 4 x i16> %a
}
@@ -205,9 +207,9 @@ entry:
declare <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i32);
+ iXLen);
-define <vscale x 8 x i16> @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, i32 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.v.v_v_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -217,7 +219,7 @@ entry:
%a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.v.nxv8i16(
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 8 x i16> %a
}
@@ -225,9 +227,9 @@ entry:
declare <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i32);
+ iXLen);
-define <vscale x 16 x i16> @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, i32 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.v.v_v_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -237,7 +239,7 @@ entry:
%a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.v.nxv16i16(
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 16 x i16> %a
}
@@ -245,9 +247,9 @@ entry:
declare <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i32);
+ iXLen);
-define <vscale x 32 x i16> @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, i32 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.v.v_v_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -257,7 +259,7 @@ entry:
%a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.v.nxv32i16(
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 32 x i16> %a
}
@@ -265,9 +267,9 @@ entry:
declare <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i32);
+ iXLen);
-define <vscale x 1 x i32> @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.v.v_v_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -277,7 +279,7 @@ entry:
%a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.v.nxv1i32(
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 1 x i32> %a
}
@@ -285,9 +287,9 @@ entry:
declare <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i32);
+ iXLen);
-define <vscale x 2 x i32> @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.v.v_v_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -297,7 +299,7 @@ entry:
%a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 2 x i32> %a
}
@@ -305,9 +307,9 @@ entry:
declare <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i32);
+ iXLen);
-define <vscale x 4 x i32> @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.v.v_v_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -317,7 +319,7 @@ entry:
%a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 4 x i32> %a
}
@@ -325,9 +327,9 @@ entry:
declare <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i32);
+ iXLen);
-define <vscale x 8 x i32> @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.v.v_v_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -337,7 +339,7 @@ entry:
%a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.v.nxv8i32(
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 8 x i32> %a
}
@@ -345,9 +347,9 @@ entry:
declare <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i32);
+ iXLen);
-define <vscale x 16 x i32> @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.v.v_v_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -357,7 +359,7 @@ entry:
%a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.v.nxv16i32(
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 16 x i32> %a
}
@@ -365,9 +367,9 @@ entry:
declare <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i32);
+ iXLen);
-define <vscale x 1 x i64> @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, i32 %1) nounwind {
+define <vscale x 1 x i64> @intrinsic_vmv.v.v_v_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -377,7 +379,7 @@ entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 1 x i64> %a
}
@@ -385,9 +387,9 @@ entry:
declare <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i32);
+ iXLen);
-define <vscale x 2 x i64> @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, i32 %1) nounwind {
+define <vscale x 2 x i64> @intrinsic_vmv.v.v_v_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -397,7 +399,7 @@ entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.v.nxv2i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 2 x i64> %a
}
@@ -405,9 +407,9 @@ entry:
declare <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i32);
+ iXLen);
-define <vscale x 4 x i64> @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, i32 %1) nounwind {
+define <vscale x 4 x i64> @intrinsic_vmv.v.v_v_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -417,7 +419,7 @@ entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.v.nxv4i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 4 x i64> %a
}
@@ -425,9 +427,9 @@ entry:
declare <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i32);
+ iXLen);
-define <vscale x 8 x i64> @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, i32 %1) nounwind {
+define <vscale x 8 x i64> @intrinsic_vmv.v.v_v_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -437,7 +439,7 @@ entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.v.nxv8i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 8 x i64> %a
}
@@ -445,9 +447,9 @@ entry:
declare <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
<vscale x 1 x half>,
<vscale x 1 x half>,
- i32);
+ iXLen);
-define <vscale x 1 x half> @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, i32 %1) nounwind {
+define <vscale x 1 x half> @intrinsic_vmv.v.v_v_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -457,7 +459,7 @@ entry:
%a = call <vscale x 1 x half> @llvm.riscv.vmv.v.v.nxv1f16(
<vscale x 1 x half> undef,
<vscale x 1 x half> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 1 x half> %a
}
@@ -465,9 +467,9 @@ entry:
declare <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
<vscale x 2 x half>,
<vscale x 2 x half>,
- i32);
+ iXLen);
-define <vscale x 2 x half> @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, i32 %1) nounwind {
+define <vscale x 2 x half> @intrinsic_vmv.v.v_v_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -477,7 +479,7 @@ entry:
%a = call <vscale x 2 x half> @llvm.riscv.vmv.v.v.nxv2f16(
<vscale x 2 x half> undef,
<vscale x 2 x half> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 2 x half> %a
}
@@ -485,9 +487,9 @@ entry:
declare <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
<vscale x 4 x half>,
<vscale x 4 x half>,
- i32);
+ iXLen);
-define <vscale x 4 x half> @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, i32 %1) nounwind {
+define <vscale x 4 x half> @intrinsic_vmv.v.v_v_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -497,7 +499,7 @@ entry:
%a = call <vscale x 4 x half> @llvm.riscv.vmv.v.v.nxv4f16(
<vscale x 4 x half> undef,
<vscale x 4 x half> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 4 x half> %a
}
@@ -505,9 +507,9 @@ entry:
declare <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
<vscale x 8 x half>,
<vscale x 8 x half>,
- i32);
+ iXLen);
-define <vscale x 8 x half> @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, i32 %1) nounwind {
+define <vscale x 8 x half> @intrinsic_vmv.v.v_v_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -517,7 +519,7 @@ entry:
%a = call <vscale x 8 x half> @llvm.riscv.vmv.v.v.nxv8f16(
<vscale x 8 x half> undef,
<vscale x 8 x half> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 8 x half> %a
}
@@ -525,9 +527,9 @@ entry:
declare <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
<vscale x 16 x half>,
<vscale x 16 x half>,
- i32);
+ iXLen);
-define <vscale x 16 x half> @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, i32 %1) nounwind {
+define <vscale x 16 x half> @intrinsic_vmv.v.v_v_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
%a = call <vscale x 16 x half> @llvm.riscv.vmv.v.v.nxv16f16(
<vscale x 16 x half> undef,
<vscale x 16 x half> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 16 x half> %a
}
@@ -545,9 +547,9 @@ entry:
declare <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
<vscale x 32 x half>,
<vscale x 32 x half>,
- i32);
+ iXLen);
-define <vscale x 32 x half> @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, i32 %1) nounwind {
+define <vscale x 32 x half> @intrinsic_vmv.v.v_v_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -557,7 +559,7 @@ entry:
%a = call <vscale x 32 x half> @llvm.riscv.vmv.v.v.nxv32f16(
<vscale x 32 x half> undef,
<vscale x 32 x half> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 32 x half> %a
}
@@ -565,9 +567,9 @@ entry:
declare <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
<vscale x 1 x float>,
<vscale x 1 x float>,
- i32);
+ iXLen);
-define <vscale x 1 x float> @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, i32 %1) nounwind {
+define <vscale x 1 x float> @intrinsic_vmv.v.v_v_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -577,7 +579,7 @@ entry:
%a = call <vscale x 1 x float> @llvm.riscv.vmv.v.v.nxv1f32(
<vscale x 1 x float> undef,
<vscale x 1 x float> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 1 x float> %a
}
@@ -585,9 +587,9 @@ entry:
declare <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
<vscale x 2 x float>,
<vscale x 2 x float>,
- i32);
+ iXLen);
-define <vscale x 2 x float> @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, i32 %1) nounwind {
+define <vscale x 2 x float> @intrinsic_vmv.v.v_v_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -597,7 +599,7 @@ entry:
%a = call <vscale x 2 x float> @llvm.riscv.vmv.v.v.nxv2f32(
<vscale x 2 x float> undef,
<vscale x 2 x float> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 2 x float> %a
}
@@ -605,9 +607,9 @@ entry:
declare <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
<vscale x 4 x float>,
<vscale x 4 x float>,
- i32);
+ iXLen);
-define <vscale x 4 x float> @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, i32 %1) nounwind {
+define <vscale x 4 x float> @intrinsic_vmv.v.v_v_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -617,7 +619,7 @@ entry:
%a = call <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(
<vscale x 4 x float> undef,
<vscale x 4 x float> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 4 x float> %a
}
@@ -625,9 +627,9 @@ entry:
declare <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
<vscale x 8 x float>,
<vscale x 8 x float>,
- i32);
+ iXLen);
-define <vscale x 8 x float> @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, i32 %1) nounwind {
+define <vscale x 8 x float> @intrinsic_vmv.v.v_v_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -637,7 +639,7 @@ entry:
%a = call <vscale x 8 x float> @llvm.riscv.vmv.v.v.nxv8f32(
<vscale x 8 x float> undef,
<vscale x 8 x float> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 8 x float> %a
}
@@ -645,9 +647,9 @@ entry:
declare <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
<vscale x 16 x float>,
<vscale x 16 x float>,
- i32);
+ iXLen);
-define <vscale x 16 x float> @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, i32 %1) nounwind {
+define <vscale x 16 x float> @intrinsic_vmv.v.v_v_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -657,7 +659,7 @@ entry:
%a = call <vscale x 16 x float> @llvm.riscv.vmv.v.v.nxv16f32(
<vscale x 16 x float> undef,
<vscale x 16 x float> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 16 x float> %a
}
@@ -665,9 +667,9 @@ entry:
declare <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
<vscale x 1 x double>,
<vscale x 1 x double>,
- i32);
+ iXLen);
-define <vscale x 1 x double> @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, i32 %1) nounwind {
+define <vscale x 1 x double> @intrinsic_vmv.v.v_v_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -677,7 +679,7 @@ entry:
%a = call <vscale x 1 x double> @llvm.riscv.vmv.v.v.nxv1f64(
<vscale x 1 x double> undef,
<vscale x 1 x double> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 1 x double> %a
}
@@ -685,9 +687,9 @@ entry:
declare <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
<vscale x 2 x double>,
<vscale x 2 x double>,
- i32);
+ iXLen);
-define <vscale x 2 x double> @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, i32 %1) nounwind {
+define <vscale x 2 x double> @intrinsic_vmv.v.v_v_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -697,7 +699,7 @@ entry:
%a = call <vscale x 2 x double> @llvm.riscv.vmv.v.v.nxv2f64(
<vscale x 2 x double> undef,
<vscale x 2 x double> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 2 x double> %a
}
@@ -705,9 +707,9 @@ entry:
declare <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
<vscale x 4 x double>,
<vscale x 4 x double>,
- i32);
+ iXLen);
-define <vscale x 4 x double> @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, i32 %1) nounwind {
+define <vscale x 4 x double> @intrinsic_vmv.v.v_v_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -717,7 +719,7 @@ entry:
%a = call <vscale x 4 x double> @llvm.riscv.vmv.v.v.nxv4f64(
<vscale x 4 x double> undef,
<vscale x 4 x double> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 4 x double> %a
}
@@ -725,9 +727,9 @@ entry:
declare <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
<vscale x 8 x double>,
<vscale x 8 x double>,
- i32);
+ iXLen);
-define <vscale x 8 x double> @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, i32 %1) nounwind {
+define <vscale x 8 x double> @intrinsic_vmv.v.v_v_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.v_v_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -737,7 +739,7 @@ entry:
%a = call <vscale x 8 x double> @llvm.riscv.vmv.v.v.nxv8f64(
<vscale x 8 x double> undef,
<vscale x 8 x double> %0,
- i32 %1)
+ iXLen %1)
ret <vscale x 8 x double> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll
deleted file mode 100644
index 3c28568..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv32.ll
+++ /dev/null
@@ -1,853 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
- <vscale x 1 x i8>,
- i8,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
- <vscale x 1 x i8> undef,
- i8 %0,
- i32 %1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
- <vscale x 2 x i8>,
- i8,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
- <vscale x 2 x i8> undef,
- i8 %0,
- i32 %1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
- <vscale x 4 x i8>,
- i8,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
- <vscale x 4 x i8> undef,
- i8 %0,
- i32 %1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
- <vscale x 8 x i8>,
- i8,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
- <vscale x 8 x i8> undef,
- i8 %0,
- i32 %1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
- <vscale x 16 x i8>,
- i8,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
- <vscale x 16 x i8> undef,
- i8 %0,
- i32 %1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
- <vscale x 32 x i8>,
- i8,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_x_nxv32i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
- <vscale x 32 x i8> undef,
- i8 %0,
- i32 %1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
- <vscale x 64 x i8>,
- i8,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
- <vscale x 64 x i8> undef,
- i8 %0,
- i32 %1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
- <vscale x 1 x i16>,
- i16,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
- <vscale x 1 x i16> undef,
- i16 %0,
- i32 %1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
- <vscale x 2 x i16>,
- i16,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
- <vscale x 2 x i16> undef,
- i16 %0,
- i32 %1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
- <vscale x 4 x i16>,
- i16,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
- <vscale x 4 x i16> undef,
- i16 %0,
- i32 %1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
- <vscale x 8 x i16>,
- i16,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
- <vscale x 8 x i16> undef,
- i16 %0,
- i32 %1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
- <vscale x 16 x i16>,
- i16,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
- <vscale x 16 x i16> undef,
- i16 %0,
- i32 %1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
- <vscale x 32 x i16>,
- i16,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
- <vscale x 32 x i16> undef,
- i16 %0,
- i32 %1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
- <vscale x 1 x i32>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_x_nxv1i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
- <vscale x 1 x i32> undef,
- i32 %0,
- i32 %1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
- <vscale x 2 x i32>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
- <vscale x 2 x i32> undef,
- i32 %0,
- i32 %1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
- <vscale x 4 x i32>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
- <vscale x 4 x i32> undef,
- i32 %0,
- i32 %1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
- <vscale x 8 x i32>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
- <vscale x 8 x i32> undef,
- i32 %0,
- i32 %1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
- <vscale x 16 x i32>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
- <vscale x 16 x i32> undef,
- i32 %0,
- i32 %1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
- <vscale x 1 x i64>,
- i64,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v8, (a0), zero
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
- <vscale x 1 x i64> undef,
- i64 %0,
- i32 %1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
- <vscale x 2 x i64>,
- i64,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v8, (a0), zero
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
- <vscale x 2 x i64> undef,
- i64 %0,
- i32 %1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
- <vscale x 4 x i64>,
- i64,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v8, (a0), zero
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
- <vscale x 4 x i64> undef,
- i64 %0,
- i32 %1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
- <vscale x 8 x i64>,
- i64,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v8, (a0), zero
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
- <vscale x 8 x i64> undef,
- i64 %0,
- i32 %1)
-
- ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_i_nxv1i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
- <vscale x 1 x i8> undef,
- i8 9,
- i32 %0)
-
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_i_nxv2i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
- <vscale x 2 x i8> undef,
- i8 9,
- i32 %0)
-
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_i_nxv4i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
- <vscale x 4 x i8> undef,
- i8 9,
- i32 %0)
-
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_i_nxv8i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
- <vscale x 8 x i8> undef,
- i8 9,
- i32 %0)
-
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_i_nxv16i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
- <vscale x 16 x i8> undef,
- i8 9,
- i32 %0)
-
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_i_nxv32i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
- <vscale x 32 x i8> undef,
- i8 9,
- i32 %0)
-
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_i_nxv64i8(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
- <vscale x 64 x i8> undef,
- i8 9,
- i32 %0)
-
- ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_i_nxv1i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
- <vscale x 1 x i16> undef,
- i16 9,
- i32 %0)
-
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_i_nxv2i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
- <vscale x 2 x i16> undef,
- i16 9,
- i32 %0)
-
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_i_nxv4i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
- <vscale x 4 x i16> undef,
- i16 9,
- i32 %0)
-
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_i_nxv8i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
- <vscale x 8 x i16> undef,
- i16 9,
- i32 %0)
-
- ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_i_nxv16i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
- <vscale x 16 x i16> undef,
- i16 9,
- i32 %0)
-
- ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_i_nxv32i16(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
- <vscale x 32 x i16> undef,
- i16 9,
- i32 %0)
-
- ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_i_nxv1i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
- <vscale x 1 x i32> undef,
- i32 9,
- i32 %0)
-
- ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_i_nxv2i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
- <vscale x 2 x i32> undef,
- i32 9,
- i32 %0)
-
- ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_i_nxv4i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
- <vscale x 4 x i32> undef,
- i32 9,
- i32 %0)
-
- ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_i_nxv8i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
- <vscale x 8 x i32> undef,
- i32 9,
- i32 %0)
-
- ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_i_nxv16i32(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
- <vscale x 16 x i32> undef,
- i32 9,
- i32 %0)
-
- ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
- <vscale x 1 x i64> undef,
- i64 9,
- i32 %0)
-
- ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
- <vscale x 2 x i64> undef,
- i64 9,
- i32 %0)
-
- ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
- <vscale x 4 x i64> undef,
- i64 9,
- i32 %0)
-
- ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64(i32 %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
- <vscale x 8 x i64> undef,
- i64 9,
- i32 %0)
-
- ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i64_vlmax:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 3
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
- <vscale x 1 x i64> undef,
- i64 12884901891,
- i32 -1)
-
- ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i64_vlmax:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 3
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
- <vscale x 2 x i64> undef,
- i64 12884901891,
- i32 -1)
-
- ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i64_vlmax:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 3
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
- <vscale x 4 x i64> undef,
- i64 12884901891,
- i32 -1)
-
- ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64_vlmax() nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i64_vlmax:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 3
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
- <vscale x 8 x i64> undef,
- i64 12884901891,
- i32 -1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x.ll
index feb05fe4..4fa95fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.x-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.x.ll
@@ -1,13 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
<vscale x 1 x i8>,
i8,
- i64);
+ iXLen);
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, i64 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.v.x_x_nxv1i8(i8 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
@@ -17,7 +19,7 @@ entry:
%a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
<vscale x 1 x i8> undef,
i8 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i8> %a
}
@@ -25,9 +27,9 @@ entry:
declare <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
<vscale x 2 x i8>,
i8,
- i64);
+ iXLen);
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, i64 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.v.x_x_nxv2i8(i8 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
@@ -37,7 +39,7 @@ entry:
%a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
<vscale x 2 x i8> undef,
i8 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i8> %a
}
@@ -45,9 +47,9 @@ entry:
declare <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
<vscale x 4 x i8>,
i8,
- i64);
+ iXLen);
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, i64 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.v.x_x_nxv4i8(i8 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
@@ -57,7 +59,7 @@ entry:
%a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
<vscale x 4 x i8> undef,
i8 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i8> %a
}
@@ -65,9 +67,9 @@ entry:
declare <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
<vscale x 8 x i8>,
i8,
- i64);
+ iXLen);
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, i64 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.v.x_x_nxv8i8(i8 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
@@ -77,7 +79,7 @@ entry:
%a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
<vscale x 8 x i8> undef,
i8 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i8> %a
}
@@ -85,9 +87,9 @@ entry:
declare <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
<vscale x 16 x i8>,
i8,
- i64);
+ iXLen);
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, i64 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.v.x_x_nxv16i8(i8 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
@@ -97,7 +99,7 @@ entry:
%a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
<vscale x 16 x i8> undef,
i8 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i8> %a
}
@@ -105,9 +107,9 @@ entry:
declare <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
<vscale x 32 x i8>,
i8,
- i64);
+ iXLen);
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_x_nxv32i8(i8 %0, i64 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.v.x_x_nxv32i8(i8 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
@@ -117,7 +119,7 @@ entry:
%a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
<vscale x 32 x i8> undef,
i8 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 32 x i8> %a
}
@@ -125,9 +127,9 @@ entry:
declare <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
<vscale x 64 x i8>,
i8,
- i64);
+ iXLen);
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, i64 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.v.x_x_nxv64i8(i8 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
@@ -137,7 +139,7 @@ entry:
%a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
<vscale x 64 x i8> undef,
i8 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 64 x i8> %a
}
@@ -145,9 +147,9 @@ entry:
declare <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
<vscale x 1 x i16>,
i16,
- i64);
+ iXLen);
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, i64 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.v.x_x_nxv1i16(i16 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
@@ -157,7 +159,7 @@ entry:
%a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
<vscale x 1 x i16> undef,
i16 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i16> %a
}
@@ -165,9 +167,9 @@ entry:
declare <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
<vscale x 2 x i16>,
i16,
- i64);
+ iXLen);
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, i64 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.v.x_x_nxv2i16(i16 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
@@ -177,7 +179,7 @@ entry:
%a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
<vscale x 2 x i16> undef,
i16 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i16> %a
}
@@ -185,9 +187,9 @@ entry:
declare <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
<vscale x 4 x i16>,
i16,
- i64);
+ iXLen);
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, i64 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.v.x_x_nxv4i16(i16 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
@@ -197,7 +199,7 @@ entry:
%a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
<vscale x 4 x i16> undef,
i16 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i16> %a
}
@@ -205,9 +207,9 @@ entry:
declare <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
<vscale x 8 x i16>,
i16,
- i64);
+ iXLen);
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, i64 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.v.x_x_nxv8i16(i16 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
@@ -217,7 +219,7 @@ entry:
%a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
<vscale x 8 x i16> undef,
i16 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i16> %a
}
@@ -225,9 +227,9 @@ entry:
declare <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
<vscale x 16 x i16>,
i16,
- i64);
+ iXLen);
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, i64 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.v.x_x_nxv16i16(i16 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
@@ -237,7 +239,7 @@ entry:
%a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
<vscale x 16 x i16> undef,
i16 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i16> %a
}
@@ -245,9 +247,9 @@ entry:
declare <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
<vscale x 32 x i16>,
i16,
- i64);
+ iXLen);
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, i64 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.v.x_x_nxv32i16(i16 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
@@ -257,7 +259,7 @@ entry:
%a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
<vscale x 32 x i16> undef,
i16 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 32 x i16> %a
}
@@ -265,9 +267,9 @@ entry:
declare <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
<vscale x 1 x i32>,
i32,
- i64);
+ iXLen);
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_x_nxv1i32(i32 %0, i64 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.v.x_x_nxv1i32(i32 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
@@ -277,7 +279,7 @@ entry:
%a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
<vscale x 1 x i32> undef,
i32 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i32> %a
}
@@ -285,9 +287,9 @@ entry:
declare <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
<vscale x 2 x i32>,
i32,
- i64);
+ iXLen);
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, i64 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.v.x_x_nxv2i32(i32 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
@@ -297,7 +299,7 @@ entry:
%a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
<vscale x 2 x i32> undef,
i32 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i32> %a
}
@@ -305,9 +307,9 @@ entry:
declare <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
<vscale x 4 x i32>,
i32,
- i64);
+ iXLen);
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, i64 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.v.x_x_nxv4i32(i32 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
@@ -317,7 +319,7 @@ entry:
%a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
<vscale x 4 x i32> undef,
i32 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i32> %a
}
@@ -325,9 +327,9 @@ entry:
declare <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
<vscale x 8 x i32>,
i32,
- i64);
+ iXLen);
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, i64 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.v.x_x_nxv8i32(i32 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
@@ -337,7 +339,7 @@ entry:
%a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
<vscale x 8 x i32> undef,
i32 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i32> %a
}
@@ -345,9 +347,9 @@ entry:
declare <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
<vscale x 16 x i32>,
i32,
- i64);
+ iXLen);
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, i64 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.v.x_x_nxv16i32(i32 %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -357,7 +359,7 @@ entry:
%a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
<vscale x 16 x i32> undef,
i32 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i32> %a
}
@@ -365,19 +367,30 @@ entry:
declare <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
<vscale x 1 x i64>,
i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
+ iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vmv.v.x_x_nxv1i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv1i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
<vscale x 1 x i64> undef,
i64 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i64> %a
}
@@ -385,19 +398,30 @@ entry:
declare <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
<vscale x 2 x i64>,
i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
+ iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vmv.v.x_x_nxv2i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv2i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
<vscale x 2 x i64> undef,
i64 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i64> %a
}
@@ -405,19 +429,30 @@ entry:
declare <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
<vscale x 4 x i64>,
i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
+ iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vmv.v.x_x_nxv4i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv4i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
<vscale x 4 x i64> undef,
i64 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i64> %a
}
@@ -425,24 +460,35 @@ entry:
declare <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
<vscale x 8 x i64>,
i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: ret
+ iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vmv.v.x_x_nxv8i64(i64 %0, iXLen %1) nounwind {
+; RV32-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.v.x_x_nxv8i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
<vscale x 8 x i64> undef,
i64 %0,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i64> %a
}
-define <vscale x 1 x i8> @intrinsic_vmv.v.x_i_nxv1i8(i64 %0) nounwind {
+define <vscale x 1 x i8> @intrinsic_vmv.v.x_i_nxv1i8(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -452,12 +498,12 @@ entry:
%a = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8(
<vscale x 1 x i8> undef,
i8 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 1 x i8> %a
}
-define <vscale x 2 x i8> @intrinsic_vmv.v.x_i_nxv2i8(i64 %0) nounwind {
+define <vscale x 2 x i8> @intrinsic_vmv.v.x_i_nxv2i8(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -467,12 +513,12 @@ entry:
%a = call <vscale x 2 x i8> @llvm.riscv.vmv.v.x.nxv2i8(
<vscale x 2 x i8> undef,
i8 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 2 x i8> %a
}
-define <vscale x 4 x i8> @intrinsic_vmv.v.x_i_nxv4i8(i64 %0) nounwind {
+define <vscale x 4 x i8> @intrinsic_vmv.v.x_i_nxv4i8(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -482,12 +528,12 @@ entry:
%a = call <vscale x 4 x i8> @llvm.riscv.vmv.v.x.nxv4i8(
<vscale x 4 x i8> undef,
i8 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 4 x i8> %a
}
-define <vscale x 8 x i8> @intrinsic_vmv.v.x_i_nxv8i8(i64 %0) nounwind {
+define <vscale x 8 x i8> @intrinsic_vmv.v.x_i_nxv8i8(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -497,12 +543,12 @@ entry:
%a = call <vscale x 8 x i8> @llvm.riscv.vmv.v.x.nxv8i8(
<vscale x 8 x i8> undef,
i8 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 8 x i8> %a
}
-define <vscale x 16 x i8> @intrinsic_vmv.v.x_i_nxv16i8(i64 %0) nounwind {
+define <vscale x 16 x i8> @intrinsic_vmv.v.x_i_nxv16i8(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -512,12 +558,12 @@ entry:
%a = call <vscale x 16 x i8> @llvm.riscv.vmv.v.x.nxv16i8(
<vscale x 16 x i8> undef,
i8 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 16 x i8> %a
}
-define <vscale x 32 x i8> @intrinsic_vmv.v.x_i_nxv32i8(i64 %0) nounwind {
+define <vscale x 32 x i8> @intrinsic_vmv.v.x_i_nxv32i8(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -527,12 +573,12 @@ entry:
%a = call <vscale x 32 x i8> @llvm.riscv.vmv.v.x.nxv32i8(
<vscale x 32 x i8> undef,
i8 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 32 x i8> %a
}
-define <vscale x 64 x i8> @intrinsic_vmv.v.x_i_nxv64i8(i64 %0) nounwind {
+define <vscale x 64 x i8> @intrinsic_vmv.v.x_i_nxv64i8(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -542,12 +588,12 @@ entry:
%a = call <vscale x 64 x i8> @llvm.riscv.vmv.v.x.nxv64i8(
<vscale x 64 x i8> undef,
i8 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 64 x i8> %a
}
-define <vscale x 1 x i16> @intrinsic_vmv.v.x_i_nxv1i16(i64 %0) nounwind {
+define <vscale x 1 x i16> @intrinsic_vmv.v.x_i_nxv1i16(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -557,12 +603,12 @@ entry:
%a = call <vscale x 1 x i16> @llvm.riscv.vmv.v.x.nxv1i16(
<vscale x 1 x i16> undef,
i16 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 1 x i16> %a
}
-define <vscale x 2 x i16> @intrinsic_vmv.v.x_i_nxv2i16(i64 %0) nounwind {
+define <vscale x 2 x i16> @intrinsic_vmv.v.x_i_nxv2i16(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -572,12 +618,12 @@ entry:
%a = call <vscale x 2 x i16> @llvm.riscv.vmv.v.x.nxv2i16(
<vscale x 2 x i16> undef,
i16 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 2 x i16> %a
}
-define <vscale x 4 x i16> @intrinsic_vmv.v.x_i_nxv4i16(i64 %0) nounwind {
+define <vscale x 4 x i16> @intrinsic_vmv.v.x_i_nxv4i16(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -587,12 +633,12 @@ entry:
%a = call <vscale x 4 x i16> @llvm.riscv.vmv.v.x.nxv4i16(
<vscale x 4 x i16> undef,
i16 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 4 x i16> %a
}
-define <vscale x 8 x i16> @intrinsic_vmv.v.x_i_nxv8i16(i64 %0) nounwind {
+define <vscale x 8 x i16> @intrinsic_vmv.v.x_i_nxv8i16(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -602,12 +648,12 @@ entry:
%a = call <vscale x 8 x i16> @llvm.riscv.vmv.v.x.nxv8i16(
<vscale x 8 x i16> undef,
i16 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 8 x i16> %a
}
-define <vscale x 16 x i16> @intrinsic_vmv.v.x_i_nxv16i16(i64 %0) nounwind {
+define <vscale x 16 x i16> @intrinsic_vmv.v.x_i_nxv16i16(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -617,12 +663,12 @@ entry:
%a = call <vscale x 16 x i16> @llvm.riscv.vmv.v.x.nxv16i16(
<vscale x 16 x i16> undef,
i16 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 16 x i16> %a
}
-define <vscale x 32 x i16> @intrinsic_vmv.v.x_i_nxv32i16(i64 %0) nounwind {
+define <vscale x 32 x i16> @intrinsic_vmv.v.x_i_nxv32i16(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -632,12 +678,12 @@ entry:
%a = call <vscale x 32 x i16> @llvm.riscv.vmv.v.x.nxv32i16(
<vscale x 32 x i16> undef,
i16 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 32 x i16> %a
}
-define <vscale x 1 x i32> @intrinsic_vmv.v.x_i_nxv1i32(i64 %0) nounwind {
+define <vscale x 1 x i32> @intrinsic_vmv.v.x_i_nxv1i32(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -647,12 +693,12 @@ entry:
%a = call <vscale x 1 x i32> @llvm.riscv.vmv.v.x.nxv1i32(
<vscale x 1 x i32> undef,
i32 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 1 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vmv.v.x_i_nxv2i32(i64 %0) nounwind {
+define <vscale x 2 x i32> @intrinsic_vmv.v.x_i_nxv2i32(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -662,12 +708,12 @@ entry:
%a = call <vscale x 2 x i32> @llvm.riscv.vmv.v.x.nxv2i32(
<vscale x 2 x i32> undef,
i32 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 2 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vmv.v.x_i_nxv4i32(i64 %0) nounwind {
+define <vscale x 4 x i32> @intrinsic_vmv.v.x_i_nxv4i32(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -677,12 +723,12 @@ entry:
%a = call <vscale x 4 x i32> @llvm.riscv.vmv.v.x.nxv4i32(
<vscale x 4 x i32> undef,
i32 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 4 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vmv.v.x_i_nxv8i32(i64 %0) nounwind {
+define <vscale x 8 x i32> @intrinsic_vmv.v.x_i_nxv8i32(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -692,12 +738,12 @@ entry:
%a = call <vscale x 8 x i32> @llvm.riscv.vmv.v.x.nxv8i32(
<vscale x 8 x i32> undef,
i32 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 8 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vmv.v.x_i_nxv16i32(i64 %0) nounwind {
+define <vscale x 16 x i32> @intrinsic_vmv.v.x_i_nxv16i32(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -707,12 +753,12 @@ entry:
%a = call <vscale x 16 x i32> @llvm.riscv.vmv.v.x.nxv16i32(
<vscale x 16 x i32> undef,
i32 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 16 x i32> %a
}
-define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64(i64 %0) nounwind {
+define <vscale x 1 x i64> @intrinsic_vmv.v.x_i_nxv1i64(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -722,12 +768,12 @@ entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vmv.v.x.nxv1i64(
<vscale x 1 x i64> undef,
i64 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 1 x i64> %a
}
-define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64(i64 %0) nounwind {
+define <vscale x 2 x i64> @intrinsic_vmv.v.x_i_nxv2i64(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -737,12 +783,12 @@ entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vmv.v.x.nxv2i64(
<vscale x 2 x i64> undef,
i64 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 2 x i64> %a
}
-define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64(i64 %0) nounwind {
+define <vscale x 4 x i64> @intrinsic_vmv.v.x_i_nxv4i64(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -752,12 +798,12 @@ entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vmv.v.x.nxv4i64(
<vscale x 4 x i64> undef,
i64 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 4 x i64> %a
}
-define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64(i64 %0) nounwind {
+define <vscale x 8 x i64> @intrinsic_vmv.v.x_i_nxv8i64(iXLen %0) nounwind {
; CHECK-LABEL: intrinsic_vmv.v.x_i_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -767,7 +813,7 @@ entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(
<vscale x 8 x i64> undef,
i64 9,
- i64 %0)
+ iXLen %0)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv32.ll
deleted file mode 100644
index 180554ba..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv32.ll
+++ /dev/null
@@ -1,300 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
-
-declare i8 @llvm.riscv.vmv.x.s.nxv1i8(<vscale x 1 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv1i8(<vscale x 1 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i8 @llvm.riscv.vmv.x.s.nxv1i8(<vscale x 1 x i8> %0)
- ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv2i8(<vscale x 2 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv2i8(<vscale x 2 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i8 @llvm.riscv.vmv.x.s.nxv2i8(<vscale x 2 x i8> %0)
- ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv4i8(<vscale x 4 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv4i8(<vscale x 4 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i8 @llvm.riscv.vmv.x.s.nxv4i8(<vscale x 4 x i8> %0)
- ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv8i8(<vscale x 8 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv8i8(<vscale x 8 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i8 @llvm.riscv.vmv.x.s.nxv8i8(<vscale x 8 x i8> %0)
- ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv16i8(<vscale x 16 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv16i8(<vscale x 16 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i8 @llvm.riscv.vmv.x.s.nxv16i8(<vscale x 16 x i8> %0)
- ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv32i8(<vscale x 32 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv32i8(<vscale x 32 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i8 @llvm.riscv.vmv.x.s.nxv32i8(<vscale x 32 x i8> %0)
- ret i8 %a
-}
-
-declare i8 @llvm.riscv.vmv.x.s.nxv64i8(<vscale x 64 x i8>)
-
-define signext i8 @intrinsic_vmv.x.s_s_nxv64i8(<vscale x 64 x i8> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i8 @llvm.riscv.vmv.x.s.nxv64i8(<vscale x 64 x i8> %0)
- ret i8 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv1i16(<vscale x 1 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv1i16(<vscale x 1 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i16 @llvm.riscv.vmv.x.s.nxv1i16(<vscale x 1 x i16> %0)
- ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv2i16(<vscale x 2 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv2i16(<vscale x 2 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i16 @llvm.riscv.vmv.x.s.nxv2i16(<vscale x 2 x i16> %0)
- ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv4i16(<vscale x 4 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv4i16(<vscale x 4 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i16 @llvm.riscv.vmv.x.s.nxv4i16(<vscale x 4 x i16> %0)
- ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv8i16(<vscale x 8 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv8i16(<vscale x 8 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i16 @llvm.riscv.vmv.x.s.nxv8i16(<vscale x 8 x i16> %0)
- ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv16i16(<vscale x 16 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv16i16(<vscale x 16 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i16 @llvm.riscv.vmv.x.s.nxv16i16( <vscale x 16 x i16> %0)
- ret i16 %a
-}
-
-declare i16 @llvm.riscv.vmv.x.s.nxv32i16( <vscale x 32 x i16>)
-
-define signext i16 @intrinsic_vmv.x.s_s_nxv32i16(<vscale x 32 x i16> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i16 @llvm.riscv.vmv.x.s.nxv32i16( <vscale x 32 x i16> %0)
- ret i16 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv1i32( <vscale x 1 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv1i32(<vscale x 1 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i32 @llvm.riscv.vmv.x.s.nxv1i32( <vscale x 1 x i32> %0)
- ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv2i32( <vscale x 2 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv2i32(<vscale x 2 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i32 @llvm.riscv.vmv.x.s.nxv2i32( <vscale x 2 x i32> %0)
- ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv4i32( <vscale x 4 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv4i32(<vscale x 4 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i32 @llvm.riscv.vmv.x.s.nxv4i32( <vscale x 4 x i32> %0)
- ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv8i32( <vscale x 8 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv8i32(<vscale x 8 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i32 @llvm.riscv.vmv.x.s.nxv8i32( <vscale x 8 x i32> %0)
- ret i32 %a
-}
-
-declare i32 @llvm.riscv.vmv.x.s.nxv16i32( <vscale x 16 x i32>)
-
-define i32 @intrinsic_vmv.x.s_s_nxv16i32(<vscale x 16 x i32> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i32 @llvm.riscv.vmv.x.s.nxv16i32( <vscale x 16 x i32> %0)
- ret i32 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv1i64(<vscale x 1 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vsrl.vx v9, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64> %0)
- ret i64 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv2i64(<vscale x 2 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; CHECK-NEXT: vsrl.vx v10, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64> %0)
- ret i64 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv4i64(<vscale x 4 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetivli zero, 1, e64, m4, ta, ma
-; CHECK-NEXT: vsrl.vx v12, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v12
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64> %0)
- ret i64 %a
-}
-
-declare i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64>)
-
-define i64 @intrinsic_vmv.x.s_s_nxv8i64(<vscale x 8 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetivli zero, 1, e64, m8, ta, ma
-; CHECK-NEXT: vsrl.vx v16, v8, a0
-; CHECK-NEXT: vmv.x.s a1, v16
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
-entry:
- %a = call i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64> %0)
- ret i64 %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.x.s.ll
index 8c6c010..0ec9439 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.x.s-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.x.s.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare i8 @llvm.riscv.vmv.x.s.nxv1i8(<vscale x 1 x i8>)
@@ -238,11 +241,20 @@ entry:
declare i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64>)
define i64 @intrinsic_vmv.x.s_s_nxv1i64(<vscale x 1 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vx v9, v8, a0
+; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv1i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
entry:
%a = call i64 @llvm.riscv.vmv.x.s.nxv1i64( <vscale x 1 x i64> %0)
ret i64 %a
@@ -251,11 +263,20 @@ entry:
declare i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64>)
define i64 @intrinsic_vmv.x.s_s_nxv2i64(<vscale x 2 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vx v10, v8, a0
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv2i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
entry:
%a = call i64 @llvm.riscv.vmv.x.s.nxv2i64( <vscale x 2 x i64> %0)
ret i64 %a
@@ -264,11 +285,20 @@ entry:
declare i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64>)
define i64 @intrinsic_vmv.x.s_s_nxv4i64(<vscale x 4 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT: vsrl.vx v12, v8, a0
+; RV32-NEXT: vmv.x.s a1, v12
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv4i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
entry:
%a = call i64 @llvm.riscv.vmv.x.s.nxv4i64( <vscale x 4 x i64> %0)
ret i64 %a
@@ -277,11 +307,20 @@ entry:
declare i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64>)
define i64 @intrinsic_vmv.x.s_s_nxv8i64(<vscale x 8 x i64> %0) nounwind {
-; CHECK-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
+; RV32-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v8, a0
+; RV32-NEXT: vmv.x.s a1, v16
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vmv.x.s_s_nxv8i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: ret
entry:
%a = call i64 @llvm.riscv.vmv.x.s.nxv8i64(<vscale x 8 x i64> %0)
ret i64 %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
index f958fe8..30edcaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vnmsac-vp.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 3ccdb5d..30e31ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -111,9 +111,9 @@ define float @vreduce_ord_fadd_nxv1f32(<vscale x 1 x float> %v, float %s) {
define float @vreduce_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
; CHECK-LABEL: vreduce_fwadd_nxv1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfwredusum.vs v8, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
@@ -126,9 +126,9 @@ define float @vreduce_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
define float @vreduce_ord_fwadd_nxv1f32(<vscale x 1 x half> %v, float %s) {
; CHECK-LABEL: vreduce_ord_fwadd_nxv1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
-; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vfwredosum.vs v8, v8, v9
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vfmv.f.s fa0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index f21b42e..7f2e3cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -13,7 +13,7 @@ declare half @llvm.vp.reduce.fadd.nxv1f16(half, <vscale x 1 x half>, <vscale x 1
define half @vpreduce_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_fadd_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfredusum.vs v9, v8, v9, v0.t
@@ -39,7 +39,7 @@ define half @vpreduce_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1
define half @vpreduce_ord_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_ord_fadd_nxv1f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; ZVFH-NEXT: vfredosum.vs v9, v8, v9, v0.t
@@ -67,7 +67,7 @@ declare half @llvm.vp.reduce.fadd.nxv2f16(half, <vscale x 2 x half>, <vscale x 2
define half @vpreduce_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_fadd_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfredusum.vs v9, v8, v9, v0.t
@@ -93,7 +93,7 @@ define half @vpreduce_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2
define half @vpreduce_ord_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) {
; ZVFH-LABEL: vpreduce_ord_fadd_nxv2f16:
; ZVFH: # %bb.0:
-; ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma
; ZVFH-NEXT: vfmv.s.f v9, fa0
; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; ZVFH-NEXT: vfredosum.vs v9, v8, v9, v0.t
@@ -389,7 +389,7 @@ declare float @llvm.vp.reduce.fadd.nxv1f32(float, <vscale x 1 x float>, <vscale
define float @vpreduce_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpreduce_fadd_nxv1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vfredusum.vs v9, v8, v9, v0.t
@@ -402,7 +402,7 @@ define float @vpreduce_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x
define float @vpreduce_ord_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpreduce_ord_fadd_nxv1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT: vfmv.s.f v9, fa0
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vfredosum.vs v9, v8, v9, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vrgather-rv32.ll
deleted file mode 100644
index 0a8b051..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv32.ll
+++ /dev/null
@@ -1,4299 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i32(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i32(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i32(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i32(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- <vscale x 1 x i8> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i32(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i32(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i32(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i32(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- <vscale x 2 x i8> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i32(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i32(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i32(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i32(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- <vscale x 4 x i8> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i32(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i32(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i32(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i32(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i32(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vrgather.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i32(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i32(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i32(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- <vscale x 16 x i8> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i32(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vrgather.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i32(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i32(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i32(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- <vscale x 32 x i8> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i32(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vrgather.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i32(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i32(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i32(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- <vscale x 64 x i8> %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i32(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i32(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i32(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i32(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i32(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i32(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i32(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i32(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i32(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i32(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i32(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i32(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i32(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vrgather.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i32(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i32(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i32(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i32(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vrgather.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i32(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i32(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i32(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i32(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vrgather.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i32(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i32(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i32(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- <vscale x 1 x i32> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- <vscale x 2 x i32> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vrgather.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- <vscale x 4 x i32> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vrgather.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- <vscale x 8 x i32> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vrgather.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- <vscale x 16 x i32> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i32(
- <vscale x 1 x half>,
- <vscale x 1 x half>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i32(
- <vscale x 1 x half> undef,
- <vscale x 1 x half> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i32(
- <vscale x 1 x half>,
- <vscale x 1 x half>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i32(
- <vscale x 1 x half> %0,
- <vscale x 1 x half> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i32(
- <vscale x 2 x half>,
- <vscale x 2 x half>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i32(
- <vscale x 2 x half> undef,
- <vscale x 2 x half> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i32(
- <vscale x 2 x half>,
- <vscale x 2 x half>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i32(
- <vscale x 2 x half> %0,
- <vscale x 2 x half> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i32(
- <vscale x 4 x half>,
- <vscale x 4 x half>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i32(
- <vscale x 4 x half> undef,
- <vscale x 4 x half> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i32(
- <vscale x 4 x half>,
- <vscale x 4 x half>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i32(
- <vscale x 4 x half> %0,
- <vscale x 4 x half> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i32(
- <vscale x 8 x half>,
- <vscale x 8 x half>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vrgather.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i32(
- <vscale x 8 x half> undef,
- <vscale x 8 x half> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i32(
- <vscale x 8 x half>,
- <vscale x 8 x half>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i32(
- <vscale x 8 x half> %0,
- <vscale x 8 x half> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i32(
- <vscale x 16 x half>,
- <vscale x 16 x half>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vrgather.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i32(
- <vscale x 16 x half> undef,
- <vscale x 16 x half> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i32(
- <vscale x 16 x half>,
- <vscale x 16 x half>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i32(
- <vscale x 16 x half> %0,
- <vscale x 16 x half> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i32(
- <vscale x 32 x half>,
- <vscale x 32 x half>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vrgather.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i32(
- <vscale x 32 x half> undef,
- <vscale x 32 x half> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i32(
- <vscale x 32 x half>,
- <vscale x 32 x half>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i32(
- <vscale x 32 x half> %0,
- <vscale x 32 x half> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i32(
- <vscale x 1 x float>,
- <vscale x 1 x float>,
- <vscale x 1 x i32>,
- i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i32(
- <vscale x 1 x float> undef,
- <vscale x 1 x float> %0,
- <vscale x 1 x i32> %1,
- i32 %2)
-
- ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i32(
- <vscale x 1 x float>,
- <vscale x 1 x float>,
- <vscale x 1 x i32>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i32(
- <vscale x 1 x float> %0,
- <vscale x 1 x float> %1,
- <vscale x 1 x i32> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i32(
- <vscale x 2 x float>,
- <vscale x 2 x float>,
- <vscale x 2 x i32>,
- i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i32(
- <vscale x 2 x float> undef,
- <vscale x 2 x float> %0,
- <vscale x 2 x i32> %1,
- i32 %2)
-
- ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i32(
- <vscale x 2 x float>,
- <vscale x 2 x float>,
- <vscale x 2 x i32>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i32(
- <vscale x 2 x float> %0,
- <vscale x 2 x float> %1,
- <vscale x 2 x i32> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i32(
- <vscale x 4 x float>,
- <vscale x 4 x float>,
- <vscale x 4 x i32>,
- i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vrgather.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i32(
- <vscale x 4 x float> undef,
- <vscale x 4 x float> %0,
- <vscale x 4 x i32> %1,
- i32 %2)
-
- ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i32(
- <vscale x 4 x float>,
- <vscale x 4 x float>,
- <vscale x 4 x i32>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i32(
- <vscale x 4 x float> %0,
- <vscale x 4 x float> %1,
- <vscale x 4 x i32> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i32(
- <vscale x 8 x float>,
- <vscale x 8 x float>,
- <vscale x 8 x i32>,
- i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vrgather.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i32(
- <vscale x 8 x float> undef,
- <vscale x 8 x float> %0,
- <vscale x 8 x i32> %1,
- i32 %2)
-
- ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i32(
- <vscale x 8 x float>,
- <vscale x 8 x float>,
- <vscale x 8 x i32>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i32(
- <vscale x 8 x float> %0,
- <vscale x 8 x float> %1,
- <vscale x 8 x i32> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i32(
- <vscale x 16 x float>,
- <vscale x 16 x float>,
- <vscale x 16 x i32>,
- i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vrgather.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i32(
- <vscale x 16 x float> undef,
- <vscale x 16 x float> %0,
- <vscale x 16 x i32> %1,
- i32 %2)
-
- ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i32(
- <vscale x 16 x float>,
- <vscale x 16 x float>,
- <vscale x 16 x i32>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i32(
- <vscale x 16 x float> %0,
- <vscale x 16 x float> %1,
- <vscale x 16 x i32> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i32(
- <vscale x 1 x double>,
- <vscale x 1 x double>,
- <vscale x 1 x i64>,
- i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vrgather.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i32(
- <vscale x 1 x double> undef,
- <vscale x 1 x double> %0,
- <vscale x 1 x i64> %1,
- i32 %2)
-
- ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i32(
- <vscale x 1 x double>,
- <vscale x 1 x double>,
- <vscale x 1 x i64>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i32(
- <vscale x 1 x double> %0,
- <vscale x 1 x double> %1,
- <vscale x 1 x i64> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i32(
- <vscale x 2 x double>,
- <vscale x 2 x double>,
- <vscale x 2 x i64>,
- i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vrgather.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i32(
- <vscale x 2 x double> undef,
- <vscale x 2 x double> %0,
- <vscale x 2 x i64> %1,
- i32 %2)
-
- ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i32(
- <vscale x 2 x double>,
- <vscale x 2 x double>,
- <vscale x 2 x i64>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i32(
- <vscale x 2 x double> %0,
- <vscale x 2 x double> %1,
- <vscale x 2 x i64> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i32(
- <vscale x 4 x double>,
- <vscale x 4 x double>,
- <vscale x 4 x i64>,
- i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vrgather.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i32(
- <vscale x 4 x double> undef,
- <vscale x 4 x double> %0,
- <vscale x 4 x i64> %1,
- i32 %2)
-
- ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i32(
- <vscale x 4 x double>,
- <vscale x 4 x double>,
- <vscale x 4 x i64>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i32(
- <vscale x 4 x double> %0,
- <vscale x 4 x double> %1,
- <vscale x 4 x i64> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i32(
- <vscale x 8 x double>,
- <vscale x 8 x double>,
- <vscale x 8 x i64>,
- i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vrgather.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i32(
- <vscale x 8 x double> undef,
- <vscale x 8 x double> %0,
- <vscale x 8 x i64> %1,
- i32 %2)
-
- ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i32(
- <vscale x 8 x double>,
- <vscale x 8 x double>,
- <vscale x 8 x i64>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i32(
- <vscale x 8 x double> %0,
- <vscale x 8 x double> %1,
- <vscale x 8 x i64> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i32(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_vx_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i8_nxv1i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i32(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i32(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i32(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i32(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_vx_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i8_nxv2i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i32(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i32(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i32(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i32(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_vx_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i8_nxv4i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i32(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i32(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i32(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i32(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_vx_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i8_nxv8i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i32(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i32(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i32(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i32(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_vx_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i8_nxv16i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vrgather.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i32(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i32(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i32(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i32(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_vx_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i8_nxv32i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vrgather.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i32(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i32(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i32,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i32(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i32 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i32(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_vx_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv64i8_nxv64i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vrgather.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i32(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i32(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i32,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i32(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i32 %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i32(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_vx_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i16_nxv1i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i32(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i32(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i32(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i32(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_vx_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i16_nxv2i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i32(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i32(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i32(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i32(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_vx_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i16_nxv4i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i32(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i32(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i32(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i32(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_vx_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i16_nxv8i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vrgather.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i32(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i32(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i32(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i32(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_vx_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i16_nxv16i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vrgather.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i32(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i32(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i32(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i32(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_vx_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i16_nxv32i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vrgather.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i32(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i32(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i32(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vrgather.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vrgather.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vrgather.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i32(
- <vscale x 1 x half>,
- <vscale x 1 x half>,
- i32,
- i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_vx_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f16_nxv1f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i32(
- <vscale x 1 x half> undef,
- <vscale x 1 x half> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i32(
- <vscale x 1 x half>,
- <vscale x 1 x half>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, <vscale x 1 x half> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i32(
- <vscale x 1 x half> %0,
- <vscale x 1 x half> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i32(
- <vscale x 2 x half>,
- <vscale x 2 x half>,
- i32,
- i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_vx_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f16_nxv2f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i32(
- <vscale x 2 x half> undef,
- <vscale x 2 x half> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i32(
- <vscale x 2 x half>,
- <vscale x 2 x half>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, <vscale x 2 x half> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i32(
- <vscale x 2 x half> %0,
- <vscale x 2 x half> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i32(
- <vscale x 4 x half>,
- <vscale x 4 x half>,
- i32,
- i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_vx_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f16_nxv4f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i32(
- <vscale x 4 x half> undef,
- <vscale x 4 x half> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i32(
- <vscale x 4 x half>,
- <vscale x 4 x half>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, <vscale x 4 x half> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i32(
- <vscale x 4 x half> %0,
- <vscale x 4 x half> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i32(
- <vscale x 8 x half>,
- <vscale x 8 x half>,
- i32,
- i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_vx_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f16_nxv8f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vrgather.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i32(
- <vscale x 8 x half> undef,
- <vscale x 8 x half> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i32(
- <vscale x 8 x half>,
- <vscale x 8 x half>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, <vscale x 8 x half> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i32(
- <vscale x 8 x half> %0,
- <vscale x 8 x half> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i32(
- <vscale x 16 x half>,
- <vscale x 16 x half>,
- i32,
- i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_vx_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f16_nxv16f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vrgather.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i32(
- <vscale x 16 x half> undef,
- <vscale x 16 x half> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i32(
- <vscale x 16 x half>,
- <vscale x 16 x half>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, <vscale x 16 x half> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i32(
- <vscale x 16 x half> %0,
- <vscale x 16 x half> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i32(
- <vscale x 32 x half>,
- <vscale x 32 x half>,
- i32,
- i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_vx_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32f16_nxv32f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vrgather.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i32(
- <vscale x 32 x half> undef,
- <vscale x 32 x half> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i32(
- <vscale x 32 x half>,
- <vscale x 32 x half>,
- i32,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, <vscale x 32 x half> %1, i32 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i32(
- <vscale x 32 x half> %0,
- <vscale x 32 x half> %1,
- i32 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i32(
- <vscale x 1 x float>,
- <vscale x 1 x float>,
- i32,
- i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_vx_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f32_nxv1f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i32(
- <vscale x 1 x float> undef,
- <vscale x 1 x float> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i32(
- <vscale x 1 x float>,
- <vscale x 1 x float>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i32(
- <vscale x 1 x float> %0,
- <vscale x 1 x float> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i32(
- <vscale x 2 x float>,
- <vscale x 2 x float>,
- i32,
- i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_vx_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f32_nxv2f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i32(
- <vscale x 2 x float> undef,
- <vscale x 2 x float> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i32(
- <vscale x 2 x float>,
- <vscale x 2 x float>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i32(
- <vscale x 2 x float> %0,
- <vscale x 2 x float> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i32(
- <vscale x 4 x float>,
- <vscale x 4 x float>,
- i32,
- i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_vx_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f32_nxv4f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vrgather.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i32(
- <vscale x 4 x float> undef,
- <vscale x 4 x float> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i32(
- <vscale x 4 x float>,
- <vscale x 4 x float>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i32(
- <vscale x 4 x float> %0,
- <vscale x 4 x float> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i32(
- <vscale x 8 x float>,
- <vscale x 8 x float>,
- i32,
- i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_vx_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f32_nxv8f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vrgather.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i32(
- <vscale x 8 x float> undef,
- <vscale x 8 x float> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i32(
- <vscale x 8 x float>,
- <vscale x 8 x float>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i32(
- <vscale x 8 x float> %0,
- <vscale x 8 x float> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i32(
- <vscale x 16 x float>,
- <vscale x 16 x float>,
- i32,
- i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_vx_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f32_nxv16f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vrgather.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i32(
- <vscale x 16 x float> undef,
- <vscale x 16 x float> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i32(
- <vscale x 16 x float>,
- <vscale x 16 x float>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i32(
- <vscale x 16 x float> %0,
- <vscale x 16 x float> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i32(
- <vscale x 1 x double>,
- <vscale x 1 x double>,
- i32,
- i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_vx_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f64_nxv1f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vrgather.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i32(
- <vscale x 1 x double> undef,
- <vscale x 1 x double> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i32(
- <vscale x 1 x double>,
- <vscale x 1 x double>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i32(
- <vscale x 1 x double> %0,
- <vscale x 1 x double> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i32(
- <vscale x 2 x double>,
- <vscale x 2 x double>,
- i32,
- i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_vx_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f64_nxv2f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vrgather.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i32(
- <vscale x 2 x double> undef,
- <vscale x 2 x double> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i32(
- <vscale x 2 x double>,
- <vscale x 2 x double>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i32(
- <vscale x 2 x double> %0,
- <vscale x 2 x double> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i32(
- <vscale x 4 x double>,
- <vscale x 4 x double>,
- i32,
- i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_vx_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f64_nxv4f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vrgather.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i32(
- <vscale x 4 x double> undef,
- <vscale x 4 x double> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i32(
- <vscale x 4 x double>,
- <vscale x 4 x double>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i32(
- <vscale x 4 x double> %0,
- <vscale x 4 x double> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i32(
- <vscale x 8 x double>,
- <vscale x 8 x double>,
- i32,
- i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_vx_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f64_nxv8f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vrgather.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i32(
- <vscale x 8 x double> undef,
- <vscale x 8 x double> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i32(
- <vscale x 8 x double>,
- <vscale x 8 x double>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, <vscale x 8 x double> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i32(
- <vscale x 8 x double> %0,
- <vscale x 8 x double> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x double> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vrgather_vi_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i8_nxv1i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i32(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i32(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i32(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vrgather_vi_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i8_nxv2i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i32(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i32(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i32(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vrgather_vi_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i8_nxv4i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i32(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i32(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i32(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vrgather_vi_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i8_nxv8i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i32(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i32(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i32(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vrgather_vi_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i8_nxv16i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vrgather.vi v10, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i32(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i32(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i32(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i32 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vrgather_vi_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i8_nxv32i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v12, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i32(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i32(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i32(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i32 9,
- <vscale x 32 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vrgather_vi_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv64i8_nxv64i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vrgather.vi v16, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i32(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i32(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i32(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i32 9,
- <vscale x 64 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vrgather_vi_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i16_nxv1i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i32(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i32(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i32(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vrgather_vi_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i16_nxv2i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i32(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i32(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i32(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vrgather_vi_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i16_nxv4i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i32(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i32(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i32(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vrgather_vi_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i16_nxv8i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vrgather.vi v10, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i32(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i32(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i32(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vrgather_vi_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i16_nxv16i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v12, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i32(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i32(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i32(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vrgather_vi_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i16_nxv32i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vrgather.vi v16, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i32(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i32(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i32(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 9,
- <vscale x 32 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vrgather_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vrgather_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vrgather_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vrgather.vi v10, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vrgather_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v12, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vrgather_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vrgather.vi v16, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x half> @intrinsic_vrgather_vi_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f16_nxv1f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i32(
- <vscale x 1 x half> undef,
- <vscale x 1 x half> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x half> %a
-}
-
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i32(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i32(
- <vscale x 1 x half> %0,
- <vscale x 1 x half> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x half> %a
-}
-
-define <vscale x 2 x half> @intrinsic_vrgather_vi_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f16_nxv2f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i32(
- <vscale x 2 x half> undef,
- <vscale x 2 x half> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x half> %a
-}
-
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i32(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i32(
- <vscale x 2 x half> %0,
- <vscale x 2 x half> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x half> %a
-}
-
-define <vscale x 4 x half> @intrinsic_vrgather_vi_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f16_nxv4f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i32(
- <vscale x 4 x half> undef,
- <vscale x 4 x half> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x half> %a
-}
-
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i32(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i32(
- <vscale x 4 x half> %0,
- <vscale x 4 x half> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x half> %a
-}
-
-define <vscale x 8 x half> @intrinsic_vrgather_vi_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f16_nxv8f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vrgather.vi v10, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i32(
- <vscale x 8 x half> undef,
- <vscale x 8 x half> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x half> %a
-}
-
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i32(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i32(
- <vscale x 8 x half> %0,
- <vscale x 8 x half> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x half> %a
-}
-
-define <vscale x 16 x half> @intrinsic_vrgather_vi_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f16_nxv16f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v12, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i32(
- <vscale x 16 x half> undef,
- <vscale x 16 x half> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 16 x half> %a
-}
-
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i32(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i32(
- <vscale x 16 x half> %0,
- <vscale x 16 x half> %1,
- i32 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x half> %a
-}
-
-define <vscale x 32 x half> @intrinsic_vrgather_vi_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32f16_nxv32f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vrgather.vi v16, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i32(
- <vscale x 32 x half> undef,
- <vscale x 32 x half> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 32 x half> %a
-}
-
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i32(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i32(
- <vscale x 32 x half> %0,
- <vscale x 32 x half> %1,
- i32 9,
- <vscale x 32 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 32 x half> %a
-}
-
-define <vscale x 1 x float> @intrinsic_vrgather_vi_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f32_nxv1f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i32(
- <vscale x 1 x float> undef,
- <vscale x 1 x float> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x float> %a
-}
-
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i32(
- <vscale x 1 x float> %0,
- <vscale x 1 x float> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x float> %a
-}
-
-define <vscale x 2 x float> @intrinsic_vrgather_vi_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f32_nxv2f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i32(
- <vscale x 2 x float> undef,
- <vscale x 2 x float> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x float> %a
-}
-
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i32(
- <vscale x 2 x float> %0,
- <vscale x 2 x float> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x float> %a
-}
-
-define <vscale x 4 x float> @intrinsic_vrgather_vi_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f32_nxv4f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vrgather.vi v10, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i32(
- <vscale x 4 x float> undef,
- <vscale x 4 x float> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x float> %a
-}
-
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i32(
- <vscale x 4 x float> %0,
- <vscale x 4 x float> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x float> %a
-}
-
-define <vscale x 8 x float> @intrinsic_vrgather_vi_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f32_nxv8f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v12, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i32(
- <vscale x 8 x float> undef,
- <vscale x 8 x float> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x float> %a
-}
-
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i32(
- <vscale x 8 x float> %0,
- <vscale x 8 x float> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x float> %a
-}
-
-define <vscale x 16 x float> @intrinsic_vrgather_vi_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f32_nxv16f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vrgather.vi v16, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i32(
- <vscale x 16 x float> undef,
- <vscale x 16 x float> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 16 x float> %a
-}
-
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i32(
- <vscale x 16 x float> %0,
- <vscale x 16 x float> %1,
- i32 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x float> %a
-}
-
-define <vscale x 1 x double> @intrinsic_vrgather_vi_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f64_nxv1f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vrgather.vi v9, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i32(
- <vscale x 1 x double> undef,
- <vscale x 1 x double> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x double> %a
-}
-
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i32(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i32(
- <vscale x 1 x double> %0,
- <vscale x 1 x double> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x double> %a
-}
-
-define <vscale x 2 x double> @intrinsic_vrgather_vi_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f64_nxv2f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vrgather.vi v10, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i32(
- <vscale x 2 x double> undef,
- <vscale x 2 x double> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x double> %a
-}
-
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i32(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i32(
- <vscale x 2 x double> %0,
- <vscale x 2 x double> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x double> %a
-}
-
-define <vscale x 4 x double> @intrinsic_vrgather_vi_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f64_nxv4f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vrgather.vi v12, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i32(
- <vscale x 4 x double> undef,
- <vscale x 4 x double> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x double> %a
-}
-
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i32(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i32(
- <vscale x 4 x double> %0,
- <vscale x 4 x double> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x double> %a
-}
-
-define <vscale x 8 x double> @intrinsic_vrgather_vi_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f64_nxv8f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vrgather.vi v16, v8, 9
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i32(
- <vscale x 8 x double> undef,
- <vscale x 8 x double> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x double> %a
-}
-
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i32(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i32(
- <vscale x 8 x double> %0,
- <vscale x 8 x double> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vrgather.ll
index f6c4869..d11e172 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrgather-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrgather.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \
+; RUN: -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \
+; RUN: -verify-machineinstrs | FileCheck %s
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.iXLen(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -16,47 +18,47 @@ define <vscale x 1 x i8> @intrinsic_vrgather_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.i64(
+ %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.nxv1i8.iXLen(
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.iXLen(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.i64(
+ %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vv.mask.nxv1i8.iXLen(
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
<vscale x 1 x i8> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.iXLen(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -64,47 +66,47 @@ define <vscale x 2 x i8> @intrinsic_vrgather_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.i64(
+ %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.nxv2i8.iXLen(
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.iXLen(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.i64(
+ %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vv.mask.nxv2i8.iXLen(
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
<vscale x 2 x i8> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.iXLen(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -112,47 +114,47 @@ define <vscale x 4 x i8> @intrinsic_vrgather_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.i64(
+ %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.nxv4i8.iXLen(
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.iXLen(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.i64(
+ %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vv.mask.nxv4i8.iXLen(
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
<vscale x 4 x i8> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.iXLen(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -160,47 +162,47 @@ define <vscale x 8 x i8> @intrinsic_vrgather_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.i64(
+ %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.nxv8i8.iXLen(
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.iXLen(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.i64(
+ %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vv.mask.nxv8i8.iXLen(
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
<vscale x 8 x i8> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.iXLen(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -208,47 +210,47 @@ define <vscale x 16 x i8> @intrinsic_vrgather_vv_nxv16i8_nxv16i8_nxv16i8(<vscale
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.i64(
+ %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.nxv16i8.iXLen(
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.iXLen(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.i64(
+ %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vv.mask.nxv16i8.iXLen(
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
<vscale x 16 x i8> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.iXLen(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -256,47 +258,47 @@ define <vscale x 32 x i8> @intrinsic_vrgather_vv_nxv32i8_nxv32i8_nxv32i8(<vscale
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.i64(
+ %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.nxv32i8.iXLen(
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.iXLen(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.i64(
+ %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vv.mask.nxv32i8.iXLen(
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
<vscale x 32 x i8> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.iXLen(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -304,24 +306,24 @@ define <vscale x 64 x i8> @intrinsic_vrgather_vv_nxv64i8_nxv64i8_nxv64i8(<vscale
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.i64(
+ %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.nxv64i8.iXLen(
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.iXLen(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8r.v v24, (a0)
@@ -329,23 +331,23 @@ define <vscale x 64 x i8> @intrinsic_vrgather_mask_vv_nxv64i8_nxv64i8_nxv64i8(<v
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.i64(
+ %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vv.mask.nxv64i8.iXLen(
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
<vscale x 64 x i8> %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.iXLen(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -353,47 +355,47 @@ define <vscale x 1 x i16> @intrinsic_vrgather_vv_nxv1i16_nxv1i16_nxv1i16(<vscale
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.i64(
+ %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.nxv1i16.iXLen(
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.iXLen(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.i64(
+ %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vv.mask.nxv1i16.iXLen(
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.iXLen(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -401,47 +403,47 @@ define <vscale x 2 x i16> @intrinsic_vrgather_vv_nxv2i16_nxv2i16_nxv2i16(<vscale
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.i64(
+ %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.nxv2i16.iXLen(
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.iXLen(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.i64(
+ %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vv.mask.nxv2i16.iXLen(
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.iXLen(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -449,47 +451,47 @@ define <vscale x 4 x i16> @intrinsic_vrgather_vv_nxv4i16_nxv4i16_nxv4i16(<vscale
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.i64(
+ %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.nxv4i16.iXLen(
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.iXLen(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.i64(
+ %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vv.mask.nxv4i16.iXLen(
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.iXLen(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -497,47 +499,47 @@ define <vscale x 8 x i16> @intrinsic_vrgather_vv_nxv8i16_nxv8i16_nxv8i16(<vscale
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.i64(
+ %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.nxv8i16.iXLen(
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.iXLen(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.i64(
+ %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vv.mask.nxv8i16.iXLen(
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.iXLen(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -545,47 +547,47 @@ define <vscale x 16 x i16> @intrinsic_vrgather_vv_nxv16i16_nxv16i16_nxv16i16(<vs
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.i64(
+ %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.nxv16i16.iXLen(
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.iXLen(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.i64(
+ %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vv.mask.nxv16i16.iXLen(
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.iXLen(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -593,24 +595,24 @@ define <vscale x 32 x i16> @intrinsic_vrgather_vv_nxv32i16_nxv32i16_nxv32i16(<vs
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.i64(
+ %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.nxv32i16.iXLen(
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.iXLen(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -618,23 +620,23 @@ define <vscale x 32 x i16> @intrinsic_vrgather_mask_vv_nxv32i16_nxv32i16_nxv32i1
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.i64(
+ %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vv.mask.nxv32i16.iXLen(
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.iXLen(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -642,47 +644,47 @@ define <vscale x 1 x i32> @intrinsic_vrgather_vv_nxv1i32_nxv1i32_nxv1i32(<vscale
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.i64(
+ %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.nxv1i32.iXLen(
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.iXLen(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.i64(
+ %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vv.mask.nxv1i32.iXLen(
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
<vscale x 1 x i32> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.iXLen(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -690,47 +692,47 @@ define <vscale x 2 x i32> @intrinsic_vrgather_vv_nxv2i32_nxv2i32_nxv2i32(<vscale
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.i64(
+ %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.nxv2i32.iXLen(
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.iXLen(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.i64(
+ %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vv.mask.nxv2i32.iXLen(
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
<vscale x 2 x i32> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.iXLen(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -738,47 +740,47 @@ define <vscale x 4 x i32> @intrinsic_vrgather_vv_nxv4i32_nxv4i32_nxv4i32(<vscale
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64(
+ %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.iXLen(
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.iXLen(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.i64(
+ %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.mask.nxv4i32.iXLen(
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
<vscale x 4 x i32> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.iXLen(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -786,47 +788,47 @@ define <vscale x 8 x i32> @intrinsic_vrgather_vv_nxv8i32_nxv8i32_nxv8i32(<vscale
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.i64(
+ %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.nxv8i32.iXLen(
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.iXLen(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.i64(
+ %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vv.mask.nxv8i32.iXLen(
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
<vscale x 8 x i32> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.iXLen(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -834,24 +836,24 @@ define <vscale x 16 x i32> @intrinsic_vrgather_vv_nxv16i32_nxv16i32_nxv16i32(<vs
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.i64(
+ %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.nxv16i32.iXLen(
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.iXLen(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
@@ -859,23 +861,23 @@ define <vscale x 16 x i32> @intrinsic_vrgather_mask_vv_nxv16i32_nxv16i32_nxv16i3
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.i64(
+ %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vv.mask.nxv16i32.iXLen(
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
<vscale x 16 x i32> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.iXLen(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64);
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -883,47 +885,47 @@ define <vscale x 1 x i64> @intrinsic_vrgather_vv_nxv1i64_nxv1i64_nxv1i64(<vscale
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.i64(
+ %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.nxv1i64.iXLen(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.iXLen(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vrgather_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vrgather_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.i64(
+ %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vv.mask.nxv1i64.iXLen(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
<vscale x 1 x i64> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.iXLen(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64);
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -931,47 +933,47 @@ define <vscale x 2 x i64> @intrinsic_vrgather_vv_nxv2i64_nxv2i64_nxv2i64(<vscale
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.i64(
+ %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.nxv2i64.iXLen(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.iXLen(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vrgather_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vrgather_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.i64(
+ %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vv.mask.nxv2i64.iXLen(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
<vscale x 2 x i64> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.iXLen(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64);
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -979,47 +981,47 @@ define <vscale x 4 x i64> @intrinsic_vrgather_vv_nxv4i64_nxv4i64_nxv4i64(<vscale
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.i64(
+ %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.nxv4i64.iXLen(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.iXLen(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vrgather_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgather_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.i64(
+ %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vv.mask.nxv4i64.iXLen(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
<vscale x 4 x i64> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.iXLen(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64);
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1027,24 +1029,24 @@ define <vscale x 8 x i64> @intrinsic_vrgather_vv_nxv8i64_nxv8i64_nxv8i64(<vscale
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.i64(
+ %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.nxv8i64.iXLen(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.iXLen(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
@@ -1052,23 +1054,23 @@ define <vscale x 8 x i64> @intrinsic_vrgather_mask_vv_nxv8i64_nxv8i64_nxv8i64(<v
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.i64(
+ %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vv.mask.nxv8i64.iXLen(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
<vscale x 8 x i64> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.iXLen(
<vscale x 1 x half>,
<vscale x 1 x half>,
<vscale x 1 x i16>,
- i64);
+ iXLen)
-define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -1076,47 +1078,47 @@ define <vscale x 1 x half> @intrinsic_vrgather_vv_nxv1f16_nxv1f16_nxv1i16(<vscal
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.i64(
+ %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.nxv1f16.iXLen(
<vscale x 1 x half> undef,
<vscale x 1 x half> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x half> %a
}
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.iXLen(
<vscale x 1 x half>,
<vscale x 1 x half>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f16_nxv1f16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.i64(
+ %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vv.mask.nxv1f16.iXLen(
<vscale x 1 x half> %0,
<vscale x 1 x half> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x half> %a
}
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.iXLen(
<vscale x 2 x half>,
<vscale x 2 x half>,
<vscale x 2 x i16>,
- i64);
+ iXLen)
-define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -1124,47 +1126,47 @@ define <vscale x 2 x half> @intrinsic_vrgather_vv_nxv2f16_nxv2f16_nxv2i16(<vscal
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.i64(
+ %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.nxv2f16.iXLen(
<vscale x 2 x half> undef,
<vscale x 2 x half> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x half> %a
}
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.iXLen(
<vscale x 2 x half>,
<vscale x 2 x half>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f16_nxv2f16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.i64(
+ %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vv.mask.nxv2f16.iXLen(
<vscale x 2 x half> %0,
<vscale x 2 x half> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x half> %a
}
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.iXLen(
<vscale x 4 x half>,
<vscale x 4 x half>,
<vscale x 4 x i16>,
- i64);
+ iXLen)
-define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -1172,47 +1174,47 @@ define <vscale x 4 x half> @intrinsic_vrgather_vv_nxv4f16_nxv4f16_nxv4i16(<vscal
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.i64(
+ %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.nxv4f16.iXLen(
<vscale x 4 x half> undef,
<vscale x 4 x half> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x half> %a
}
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.iXLen(
<vscale x 4 x half>,
<vscale x 4 x half>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f16_nxv4f16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.i64(
+ %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vv.mask.nxv4f16.iXLen(
<vscale x 4 x half> %0,
<vscale x 4 x half> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x half> %a
}
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.iXLen(
<vscale x 8 x half>,
<vscale x 8 x half>,
<vscale x 8 x i16>,
- i64);
+ iXLen)
-define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -1220,47 +1222,47 @@ define <vscale x 8 x half> @intrinsic_vrgather_vv_nxv8f16_nxv8f16_nxv8i16(<vscal
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.i64(
+ %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.nxv8f16.iXLen(
<vscale x 8 x half> undef,
<vscale x 8 x half> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x half> %a
}
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.iXLen(
<vscale x 8 x half>,
<vscale x 8 x half>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f16_nxv8f16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.i64(
+ %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vv.mask.nxv8f16.iXLen(
<vscale x 8 x half> %0,
<vscale x 8 x half> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x half> %a
}
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.iXLen(
<vscale x 16 x half>,
<vscale x 16 x half>,
<vscale x 16 x i16>,
- i64);
+ iXLen)
-define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -1268,47 +1270,47 @@ define <vscale x 16 x half> @intrinsic_vrgather_vv_nxv16f16_nxv16f16_nxv16i16(<v
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.i64(
+ %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.nxv16f16.iXLen(
<vscale x 16 x half> undef,
<vscale x 16 x half> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x half> %a
}
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.iXLen(
<vscale x 16 x half>,
<vscale x 16 x half>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f16_nxv16f16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.i64(
+ %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vv.mask.nxv16f16.iXLen(
<vscale x 16 x half> %0,
<vscale x 16 x half> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x half> %a
}
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.iXLen(
<vscale x 32 x half>,
<vscale x 32 x half>,
<vscale x 32 x i16>,
- i64);
+ iXLen)
-define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -1316,24 +1318,24 @@ define <vscale x 32 x half> @intrinsic_vrgather_vv_nxv32f16_nxv32f16_nxv32i16(<v
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.i64(
+ %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.nxv32f16.iXLen(
<vscale x 32 x half> undef,
<vscale x 32 x half> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x half> %a
}
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.iXLen(
<vscale x 32 x half>,
<vscale x 32 x half>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -1341,23 +1343,23 @@ define <vscale x 32 x half> @intrinsic_vrgather_mask_vv_nxv32f16_nxv32f16_nxv32i
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.i64(
+ %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vv.mask.nxv32f16.iXLen(
<vscale x 32 x half> %0,
<vscale x 32 x half> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x half> %a
}
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.iXLen(
<vscale x 1 x float>,
<vscale x 1 x float>,
<vscale x 1 x i32>,
- i64);
+ iXLen)
-define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -1365,47 +1367,47 @@ define <vscale x 1 x float> @intrinsic_vrgather_vv_nxv1f32_nxv1f32_nxv1i32(<vsca
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.i64(
+ %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.nxv1f32.iXLen(
<vscale x 1 x float> undef,
<vscale x 1 x float> %0,
<vscale x 1 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x float> %a
}
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.iXLen(
<vscale x 1 x float>,
<vscale x 1 x float>,
<vscale x 1 x i32>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f32_nxv1f32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.i64(
+ %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vv.mask.nxv1f32.iXLen(
<vscale x 1 x float> %0,
<vscale x 1 x float> %1,
<vscale x 1 x i32> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x float> %a
}
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.iXLen(
<vscale x 2 x float>,
<vscale x 2 x float>,
<vscale x 2 x i32>,
- i64);
+ iXLen)
-define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -1413,47 +1415,47 @@ define <vscale x 2 x float> @intrinsic_vrgather_vv_nxv2f32_nxv2f32_nxv2i32(<vsca
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.i64(
+ %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.nxv2f32.iXLen(
<vscale x 2 x float> undef,
<vscale x 2 x float> %0,
<vscale x 2 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x float> %a
}
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.iXLen(
<vscale x 2 x float>,
<vscale x 2 x float>,
<vscale x 2 x i32>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x float> @intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f32_nxv2f32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.i64(
+ %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vv.mask.nxv2f32.iXLen(
<vscale x 2 x float> %0,
<vscale x 2 x float> %1,
<vscale x 2 x i32> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x float> %a
}
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.iXLen(
<vscale x 4 x float>,
<vscale x 4 x float>,
<vscale x 4 x i32>,
- i64);
+ iXLen)
-define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -1461,47 +1463,47 @@ define <vscale x 4 x float> @intrinsic_vrgather_vv_nxv4f32_nxv4f32_nxv4i32(<vsca
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.i64(
+ %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.nxv4f32.iXLen(
<vscale x 4 x float> undef,
<vscale x 4 x float> %0,
<vscale x 4 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x float> %a
}
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.iXLen(
<vscale x 4 x float>,
<vscale x 4 x float>,
<vscale x 4 x i32>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f32_nxv4f32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.i64(
+ %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vv.mask.nxv4f32.iXLen(
<vscale x 4 x float> %0,
<vscale x 4 x float> %1,
<vscale x 4 x i32> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x float> %a
}
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.iXLen(
<vscale x 8 x float>,
<vscale x 8 x float>,
<vscale x 8 x i32>,
- i64);
+ iXLen)
-define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -1509,47 +1511,47 @@ define <vscale x 8 x float> @intrinsic_vrgather_vv_nxv8f32_nxv8f32_nxv8i32(<vsca
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.i64(
+ %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.nxv8f32.iXLen(
<vscale x 8 x float> undef,
<vscale x 8 x float> %0,
<vscale x 8 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x float> %a
}
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.iXLen(
<vscale x 8 x float>,
<vscale x 8 x float>,
<vscale x 8 x i32>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f32_nxv8f32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.i64(
+ %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vv.mask.nxv8f32.iXLen(
<vscale x 8 x float> %0,
<vscale x 8 x float> %1,
<vscale x 8 x i32> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x float> %a
}
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.iXLen(
<vscale x 16 x float>,
<vscale x 16 x float>,
<vscale x 16 x i32>,
- i64);
+ iXLen)
-define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -1557,24 +1559,24 @@ define <vscale x 16 x float> @intrinsic_vrgather_vv_nxv16f32_nxv16f32_nxv16i32(<
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.i64(
+ %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.nxv16f32.iXLen(
<vscale x 16 x float> undef,
<vscale x 16 x float> %0,
<vscale x 16 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x float> %a
}
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.iXLen(
<vscale x 16 x float>,
<vscale x 16 x float>,
<vscale x 16 x i32>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
@@ -1582,23 +1584,23 @@ define <vscale x 16 x float> @intrinsic_vrgather_mask_vv_nxv16f32_nxv16f32_nxv16
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.i64(
+ %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vv.mask.nxv16f32.iXLen(
<vscale x 16 x float> %0,
<vscale x 16 x float> %1,
<vscale x 16 x i32> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x float> %a
}
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.iXLen(
<vscale x 1 x double>,
<vscale x 1 x double>,
<vscale x 1 x i64>,
- i64);
+ iXLen)
-define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -1606,47 +1608,47 @@ define <vscale x 1 x double> @intrinsic_vrgather_vv_nxv1f64_nxv1f64_nxv1i64(<vsc
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.i64(
+ %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.nxv1f64.iXLen(
<vscale x 1 x double> undef,
<vscale x 1 x double> %0,
<vscale x 1 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x double> %a
}
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.iXLen(
<vscale x 1 x double>,
<vscale x 1 x double>,
<vscale x 1 x i64>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x double> @intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv1f64_nxv1f64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.i64(
+ %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vv.mask.nxv1f64.iXLen(
<vscale x 1 x double> %0,
<vscale x 1 x double> %1,
<vscale x 1 x i64> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x double> %a
}
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.iXLen(
<vscale x 2 x double>,
<vscale x 2 x double>,
<vscale x 2 x i64>,
- i64);
+ iXLen)
-define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -1654,47 +1656,47 @@ define <vscale x 2 x double> @intrinsic_vrgather_vv_nxv2f64_nxv2f64_nxv2i64(<vsc
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.i64(
+ %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.nxv2f64.iXLen(
<vscale x 2 x double> undef,
<vscale x 2 x double> %0,
<vscale x 2 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x double> %a
}
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.iXLen(
<vscale x 2 x double>,
<vscale x 2 x double>,
<vscale x 2 x i64>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x double> @intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv2f64_nxv2f64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.i64(
+ %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vv.mask.nxv2f64.iXLen(
<vscale x 2 x double> %0,
<vscale x 2 x double> %1,
<vscale x 2 x i64> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x double> %a
}
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.iXLen(
<vscale x 4 x double>,
<vscale x 4 x double>,
<vscale x 4 x i64>,
- i64);
+ iXLen)
-define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -1702,47 +1704,47 @@ define <vscale x 4 x double> @intrinsic_vrgather_vv_nxv4f64_nxv4f64_nxv4i64(<vsc
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.i64(
+ %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.nxv4f64.iXLen(
<vscale x 4 x double> undef,
<vscale x 4 x double> %0,
<vscale x 4 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x double> %a
}
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.iXLen(
<vscale x 4 x double>,
<vscale x 4 x double>,
<vscale x 4 x i64>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv4f64_nxv4f64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
; CHECK-NEXT: vrgather.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.i64(
+ %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vv.mask.nxv4f64.iXLen(
<vscale x 4 x double> %0,
<vscale x 4 x double> %1,
<vscale x 4 x i64> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x double> %a
}
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.iXLen(
<vscale x 8 x double>,
<vscale x 8 x double>,
<vscale x 8 x i64>,
- i64);
+ iXLen)
-define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1750,24 +1752,24 @@ define <vscale x 8 x double> @intrinsic_vrgather_vv_nxv8f64_nxv8f64_nxv8i64(<vsc
; CHECK-NEXT: vmv.v.v v8, v24
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.i64(
+ %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.nxv8f64.iXLen(
<vscale x 8 x double> undef,
<vscale x 8 x double> %0,
<vscale x 8 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x double> %a
}
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.iXLen(
<vscale x 8 x double>,
<vscale x 8 x double>,
<vscale x 8 x i64>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
@@ -1775,3046 +1777,3046 @@ define <vscale x 8 x double> @intrinsic_vrgather_mask_vv_nxv8f64_nxv8f64_nxv8i64
; CHECK-NEXT: vrgather.vv v8, v16, v24, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.i64(
+ %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vv.mask.nxv8f64.iXLen(
<vscale x 8 x double> %0,
<vscale x 8 x double> %1,
<vscale x 8 x i64> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x double> %a
}
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.iXLen(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vrgather_vx_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_vx_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i64(
+ %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.iXLen(
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
-declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i64(
+declare <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.iXLen(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i64,
+ iXLen,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i64(
+ %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.iXLen(
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i64 %2,
+ iXLen %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.iXLen(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vrgather_vx_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_vx_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i64(
+ %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.iXLen(
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
-declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i64(
+declare <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.iXLen(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i64,
+ iXLen,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i64(
+ %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.iXLen(
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i64 %2,
+ iXLen %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.iXLen(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vrgather_vx_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_vx_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i64(
+ %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.iXLen(
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
-declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i64(
+declare <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.iXLen(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i64,
+ iXLen,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i64(
+ %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.iXLen(
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i64 %2,
+ iXLen %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.iXLen(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vrgather_vx_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_vx_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i64(
+ %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.iXLen(
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
-declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i64(
+declare <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.iXLen(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i64,
+ iXLen,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i64(
+ %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.iXLen(
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i64 %2,
+ iXLen %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.iXLen(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vrgather_vx_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_vx_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT: vrgather.vx v10, v8, a0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i64(
+ %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.iXLen(
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
-declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i64(
+declare <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.iXLen(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i64,
+ iXLen,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i64(
+ %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.iXLen(
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i64 %2,
+ iXLen %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.iXLen(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vrgather_vx_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_vx_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT: vrgather.vx v12, v8, a0
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i64(
+ %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.iXLen(
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
-declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i64(
+declare <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.iXLen(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i64,
+ iXLen,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i64(
+ %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.iXLen(
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i64 %2,
+ iXLen %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.iXLen(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vrgather_vx_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_vx_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT: vrgather.vx v16, v8, a0
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i64(
+ %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.iXLen(
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
-declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i64(
+declare <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.iXLen(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i64,
+ iXLen,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i64(
+ %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.iXLen(
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i64 %2,
+ iXLen %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.iXLen(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vrgather_vx_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_vx_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i64(
+ %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.iXLen(
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
-declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i64(
+declare <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.iXLen(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64,
+ iXLen,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i64(
+ %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.iXLen(
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 %2,
+ iXLen %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.iXLen(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vrgather_vx_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_vx_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i64(
+ %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.iXLen(
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
-declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i64(
+declare <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.iXLen(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64,
+ iXLen,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i64(
+ %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.iXLen(
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 %2,
+ iXLen %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.iXLen(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vrgather_vx_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_vx_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i64(
+ %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.iXLen(
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
-declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i64(
+declare <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.iXLen(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64,
+ iXLen,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i64(
+ %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.iXLen(
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 %2,
+ iXLen %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.iXLen(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vrgather_vx_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_vx_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
; CHECK-NEXT: vrgather.vx v10, v8, a0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i64(
+ %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.iXLen(
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
-declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i64(
+declare <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.iXLen(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64,
+ iXLen,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i64(
+ %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.iXLen(
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 %2,
+ iXLen %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.iXLen(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vrgather_vx_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_vx_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT: vrgather.vx v12, v8, a0
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i64(
+ %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.iXLen(
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
-declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i64(
+declare <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.iXLen(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64,
+ iXLen,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i64(
+ %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.iXLen(
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 %2,
+ iXLen %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.iXLen(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vrgather_vx_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_vx_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT: vrgather.vx v16, v8, a0
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i64(
+ %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.iXLen(
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
-declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i64(
+declare <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.iXLen(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64,
+ iXLen,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i64(
+ %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.iXLen(
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 %2,
+ iXLen %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.iXLen(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vrgather_vx_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_vx_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i64(
+ %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.iXLen(
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
-declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i64(
+declare <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.iXLen(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i64,
+ iXLen,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i64(
+ %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.iXLen(
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i64 %2,
+ iXLen %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.iXLen(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vrgather_vx_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_vx_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i64(
+ %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.iXLen(
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
-declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i64(
+declare <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.iXLen(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i64,
+ iXLen,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i64(
+ %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.iXLen(
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i64 %2,
+ iXLen %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vrgather_vx_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_vx_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; CHECK-NEXT: vrgather.vx v10, v8, a0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i64(
+ %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
-declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i64(
+declare <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.iXLen(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i64,
+ iXLen,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i64(
+ %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.iXLen(
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i64 %2,
+ iXLen %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.iXLen(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vrgather_vx_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_vx_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vrgather.vx v12, v8, a0
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i64(
+ %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.iXLen(
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
-declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i64(
+declare <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.iXLen(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i64,
+ iXLen,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i64(
+ %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.iXLen(
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i64 %2,
+ iXLen %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.iXLen(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vrgather_vx_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_vx_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vrgather.vx v16, v8, a0
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i64(
+ %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.iXLen(
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
-declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i64(
+declare <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.iXLen(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i64,
+ iXLen,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i64(
+ %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.iXLen(
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i64 %2,
+ iXLen %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.iXLen(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vrgather_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_vx_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.i64(
+ %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.iXLen(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
-declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.i64(
+declare <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.iXLen(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64,
+ iXLen,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.i64(
+ %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.iXLen(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i64 %2,
+ iXLen %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.iXLen(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vrgather_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_vx_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
; CHECK-NEXT: vrgather.vx v10, v8, a0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.i64(
+ %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.iXLen(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
-declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.i64(
+declare <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.iXLen(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64,
+ iXLen,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.i64(
+ %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.iXLen(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i64 %2,
+ iXLen %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.iXLen(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vrgather_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_vx_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; CHECK-NEXT: vrgather.vx v12, v8, a0
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.i64(
+ %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.iXLen(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
-declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.i64(
+declare <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.iXLen(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64,
+ iXLen,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.i64(
+ %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.iXLen(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i64 %2,
+ iXLen %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.iXLen(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vrgather_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_vx_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vrgather.vx v16, v8, a0
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.i64(
+ %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.iXLen(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
-declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.i64(
+declare <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.iXLen(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64,
+ iXLen,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.i64(
+ %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.iXLen(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i64 %2,
+ iXLen %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.iXLen(
<vscale x 1 x half>,
<vscale x 1 x half>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x half> @intrinsic_vrgather_vx_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_vx_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i64(
+ %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.iXLen(
<vscale x 1 x half> undef,
<vscale x 1 x half> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 1 x half> %a
}
-declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i64(
+declare <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.iXLen(
<vscale x 1 x half>,
<vscale x 1 x half>,
- i64,
+ iXLen,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, <vscale x 1 x half> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i64(
+ %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.iXLen(
<vscale x 1 x half> %0,
<vscale x 1 x half> %1,
- i64 %2,
+ iXLen %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x half> %a
}
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.iXLen(
<vscale x 2 x half>,
<vscale x 2 x half>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x half> @intrinsic_vrgather_vx_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_vx_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i64(
+ %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.iXLen(
<vscale x 2 x half> undef,
<vscale x 2 x half> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 2 x half> %a
}
-declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i64(
+declare <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.iXLen(
<vscale x 2 x half>,
<vscale x 2 x half>,
- i64,
+ iXLen,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, <vscale x 2 x half> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i64(
+ %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.iXLen(
<vscale x 2 x half> %0,
<vscale x 2 x half> %1,
- i64 %2,
+ iXLen %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x half> %a
}
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.iXLen(
<vscale x 4 x half>,
<vscale x 4 x half>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x half> @intrinsic_vrgather_vx_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_vx_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i64(
+ %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.iXLen(
<vscale x 4 x half> undef,
<vscale x 4 x half> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 4 x half> %a
}
-declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i64(
+declare <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.iXLen(
<vscale x 4 x half>,
<vscale x 4 x half>,
- i64,
+ iXLen,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, <vscale x 4 x half> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i64(
+ %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.iXLen(
<vscale x 4 x half> %0,
<vscale x 4 x half> %1,
- i64 %2,
+ iXLen %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x half> %a
}
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.iXLen(
<vscale x 8 x half>,
<vscale x 8 x half>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x half> @intrinsic_vrgather_vx_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_vx_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
; CHECK-NEXT: vrgather.vx v10, v8, a0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i64(
+ %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.iXLen(
<vscale x 8 x half> undef,
<vscale x 8 x half> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 8 x half> %a
}
-declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i64(
+declare <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.iXLen(
<vscale x 8 x half>,
<vscale x 8 x half>,
- i64,
+ iXLen,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, <vscale x 8 x half> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i64(
+ %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.iXLen(
<vscale x 8 x half> %0,
<vscale x 8 x half> %1,
- i64 %2,
+ iXLen %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x half> %a
}
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.iXLen(
<vscale x 16 x half>,
<vscale x 16 x half>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x half> @intrinsic_vrgather_vx_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_vx_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT: vrgather.vx v12, v8, a0
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i64(
+ %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.iXLen(
<vscale x 16 x half> undef,
<vscale x 16 x half> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 16 x half> %a
}
-declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i64(
+declare <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.iXLen(
<vscale x 16 x half>,
<vscale x 16 x half>,
- i64,
+ iXLen,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, <vscale x 16 x half> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i64(
+ %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.iXLen(
<vscale x 16 x half> %0,
<vscale x 16 x half> %1,
- i64 %2,
+ iXLen %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x half> %a
}
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.iXLen(
<vscale x 32 x half>,
<vscale x 32 x half>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x half> @intrinsic_vrgather_vx_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_vx_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT: vrgather.vx v16, v8, a0
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i64(
+ %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.iXLen(
<vscale x 32 x half> undef,
<vscale x 32 x half> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 32 x half> %a
}
-declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i64(
+declare <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.iXLen(
<vscale x 32 x half>,
<vscale x 32 x half>,
- i64,
+ iXLen,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, <vscale x 32 x half> %1, i64 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, iXLen %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i64(
+ %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.iXLen(
<vscale x 32 x half> %0,
<vscale x 32 x half> %1,
- i64 %2,
+ iXLen %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x half> %a
}
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.iXLen(
<vscale x 1 x float>,
<vscale x 1 x float>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x float> @intrinsic_vrgather_vx_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_vx_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i64(
+ %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.iXLen(
<vscale x 1 x float> undef,
<vscale x 1 x float> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 1 x float> %a
}
-declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i64(
+declare <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.iXLen(
<vscale x 1 x float>,
<vscale x 1 x float>,
- i64,
+ iXLen,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, <vscale x 1 x float> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i64(
+ %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.iXLen(
<vscale x 1 x float> %0,
<vscale x 1 x float> %1,
- i64 %2,
+ iXLen %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x float> %a
}
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.iXLen(
<vscale x 2 x float>,
<vscale x 2 x float>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x float> @intrinsic_vrgather_vx_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_vx_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i64(
+ %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.iXLen(
<vscale x 2 x float> undef,
<vscale x 2 x float> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 2 x float> %a
}
-declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i64(
+declare <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.iXLen(
<vscale x 2 x float>,
<vscale x 2 x float>,
- i64,
+ iXLen,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, <vscale x 2 x float> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i64(
+ %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.iXLen(
<vscale x 2 x float> %0,
<vscale x 2 x float> %1,
- i64 %2,
+ iXLen %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x float> %a
}
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.iXLen(
<vscale x 4 x float>,
<vscale x 4 x float>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x float> @intrinsic_vrgather_vx_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_vx_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
; CHECK-NEXT: vrgather.vx v10, v8, a0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i64(
+ %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.iXLen(
<vscale x 4 x float> undef,
<vscale x 4 x float> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 4 x float> %a
}
-declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i64(
+declare <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.iXLen(
<vscale x 4 x float>,
<vscale x 4 x float>,
- i64,
+ iXLen,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, <vscale x 4 x float> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i64(
+ %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.iXLen(
<vscale x 4 x float> %0,
<vscale x 4 x float> %1,
- i64 %2,
+ iXLen %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x float> %a
}
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.iXLen(
<vscale x 8 x float>,
<vscale x 8 x float>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x float> @intrinsic_vrgather_vx_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_vx_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
; CHECK-NEXT: vrgather.vx v12, v8, a0
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i64(
+ %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.iXLen(
<vscale x 8 x float> undef,
<vscale x 8 x float> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 8 x float> %a
}
-declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i64(
+declare <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.iXLen(
<vscale x 8 x float>,
<vscale x 8 x float>,
- i64,
+ iXLen,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, <vscale x 8 x float> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i64(
+ %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.iXLen(
<vscale x 8 x float> %0,
<vscale x 8 x float> %1,
- i64 %2,
+ iXLen %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x float> %a
}
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.iXLen(
<vscale x 16 x float>,
<vscale x 16 x float>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x float> @intrinsic_vrgather_vx_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_vx_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vrgather.vx v16, v8, a0
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i64(
+ %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.iXLen(
<vscale x 16 x float> undef,
<vscale x 16 x float> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 16 x float> %a
}
-declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i64(
+declare <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.iXLen(
<vscale x 16 x float>,
<vscale x 16 x float>,
- i64,
+ iXLen,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, <vscale x 16 x float> %1, i64 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, iXLen %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i64(
+ %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.iXLen(
<vscale x 16 x float> %0,
<vscale x 16 x float> %1,
- i64 %2,
+ iXLen %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x float> %a
}
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.iXLen(
<vscale x 1 x double>,
<vscale x 1 x double>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x double> @intrinsic_vrgather_vx_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_vx_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
; CHECK-NEXT: vrgather.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i64(
+ %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.iXLen(
<vscale x 1 x double> undef,
<vscale x 1 x double> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 1 x double> %a
}
-declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i64(
+declare <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.iXLen(
<vscale x 1 x double>,
<vscale x 1 x double>,
- i64,
+ iXLen,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, iXLen %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vx v8, v9, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i64(
+ %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.iXLen(
<vscale x 1 x double> %0,
<vscale x 1 x double> %1,
- i64 %2,
+ iXLen %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x double> %a
}
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.iXLen(
<vscale x 2 x double>,
<vscale x 2 x double>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x double> @intrinsic_vrgather_vx_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_vx_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
; CHECK-NEXT: vrgather.vx v10, v8, a0
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i64(
+ %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.iXLen(
<vscale x 2 x double> undef,
<vscale x 2 x double> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 2 x double> %a
}
-declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i64(
+declare <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.iXLen(
<vscale x 2 x double>,
<vscale x 2 x double>,
- i64,
+ iXLen,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, iXLen %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
; CHECK-NEXT: vrgather.vx v8, v10, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i64(
+ %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.iXLen(
<vscale x 2 x double> %0,
<vscale x 2 x double> %1,
- i64 %2,
+ iXLen %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x double> %a
}
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.iXLen(
<vscale x 4 x double>,
<vscale x 4 x double>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x double> @intrinsic_vrgather_vx_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_vx_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
; CHECK-NEXT: vrgather.vx v12, v8, a0
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i64(
+ %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.iXLen(
<vscale x 4 x double> undef,
<vscale x 4 x double> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 4 x double> %a
}
-declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i64(
+declare <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.iXLen(
<vscale x 4 x double>,
<vscale x 4 x double>,
- i64,
+ iXLen,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, iXLen %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
; CHECK-NEXT: vrgather.vx v8, v12, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i64(
+ %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.iXLen(
<vscale x 4 x double> %0,
<vscale x 4 x double> %1,
- i64 %2,
+ iXLen %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x double> %a
}
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.iXLen(
<vscale x 8 x double>,
<vscale x 8 x double>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x double> @intrinsic_vrgather_vx_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_vx_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vx_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vrgather.vx v16, v8, a0
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i64(
+ %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.iXLen(
<vscale x 8 x double> undef,
<vscale x 8 x double> %0,
- i64 %1,
- i64 %2)
+ iXLen %1,
+ iXLen %2)
ret <vscale x 8 x double> %a
}
-declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i64(
+declare <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.iXLen(
<vscale x 8 x double>,
<vscale x 8 x double>,
- i64,
+ iXLen,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, iXLen %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vx_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
; CHECK-NEXT: vrgather.vx v8, v16, a0, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i64(
+ %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.iXLen(
<vscale x 8 x double> %0,
<vscale x 8 x double> %1,
- i64 %2,
+ iXLen %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x double> %a
}
-define <vscale x 1 x i8> @intrinsic_vrgather_vi_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_vi_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.i64(
+ %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.nxv1i8.iXLen(
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 1 x i8> %a
}
-define <vscale x 1 x i8> @intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i64(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8_i64:
+define <vscale x 1 x i8> @intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.i64(
+ %a = call <vscale x 1 x i8> @llvm.riscv.vrgather.vx.mask.nxv1i8.iXLen(
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i64 9,
+ iXLen 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i8> %a
}
-define <vscale x 2 x i8> @intrinsic_vrgather_vi_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_vi_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.i64(
+ %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.nxv2i8.iXLen(
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 2 x i8> %a
}
-define <vscale x 2 x i8> @intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i64(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8_i64:
+define <vscale x 2 x i8> @intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.i64(
+ %a = call <vscale x 2 x i8> @llvm.riscv.vrgather.vx.mask.nxv2i8.iXLen(
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i64 9,
+ iXLen 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i8> %a
}
-define <vscale x 4 x i8> @intrinsic_vrgather_vi_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_vi_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.i64(
+ %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.nxv4i8.iXLen(
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 4 x i8> %a
}
-define <vscale x 4 x i8> @intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i64(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8_i64:
+define <vscale x 4 x i8> @intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.i64(
+ %a = call <vscale x 4 x i8> @llvm.riscv.vrgather.vx.mask.nxv4i8.iXLen(
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i64 9,
+ iXLen 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i8> %a
}
-define <vscale x 8 x i8> @intrinsic_vrgather_vi_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_vi_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.i64(
+ %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.nxv8i8.iXLen(
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 8 x i8> %a
}
-define <vscale x 8 x i8> @intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i64(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8_i64:
+define <vscale x 8 x i8> @intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.i64(
+ %a = call <vscale x 8 x i8> @llvm.riscv.vrgather.vx.mask.nxv8i8.iXLen(
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i64 9,
+ iXLen 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i8> %a
}
-define <vscale x 16 x i8> @intrinsic_vrgather_vi_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_vi_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 9
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.i64(
+ %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.nxv16i8.iXLen(
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 16 x i8> %a
}
-define <vscale x 16 x i8> @intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i64(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8_i64:
+define <vscale x 16 x i8> @intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.i64(
+ %a = call <vscale x 16 x i8> @llvm.riscv.vrgather.vx.mask.nxv16i8.iXLen(
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i64 9,
+ iXLen 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i8> %a
}
-define <vscale x 32 x i8> @intrinsic_vrgather_vi_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_vi_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
; CHECK-NEXT: vrgather.vi v12, v8, 9
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.i64(
+ %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.nxv32i8.iXLen(
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 32 x i8> %a
}
-define <vscale x 32 x i8> @intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i64(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8_i64:
+define <vscale x 32 x i8> @intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.i64(
+ %a = call <vscale x 32 x i8> @llvm.riscv.vrgather.vx.mask.nxv32i8.iXLen(
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i64 9,
+ iXLen 9,
<vscale x 32 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 32 x i8> %a
}
-define <vscale x 64 x i8> @intrinsic_vrgather_vi_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_vi_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT: vrgather.vi v16, v8, 9
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.i64(
+ %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.nxv64i8.iXLen(
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 64 x i8> %a
}
-define <vscale x 64 x i8> @intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i64(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8_i64:
+define <vscale x 64 x i8> @intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.i64(
+ %a = call <vscale x 64 x i8> @llvm.riscv.vrgather.vx.mask.nxv64i8.iXLen(
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i64 9,
+ iXLen 9,
<vscale x 64 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 64 x i8> %a
}
-define <vscale x 1 x i16> @intrinsic_vrgather_vi_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_vi_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.i64(
+ %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.nxv1i16.iXLen(
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 1 x i16> %a
}
-define <vscale x 1 x i16> @intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i64(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16_i64:
+define <vscale x 1 x i16> @intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.i64(
+ %a = call <vscale x 1 x i16> @llvm.riscv.vrgather.vx.mask.nxv1i16.iXLen(
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 9,
+ iXLen 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i16> %a
}
-define <vscale x 2 x i16> @intrinsic_vrgather_vi_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_vi_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.i64(
+ %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.nxv2i16.iXLen(
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 2 x i16> %a
}
-define <vscale x 2 x i16> @intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i64(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16_i64:
+define <vscale x 2 x i16> @intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.i64(
+ %a = call <vscale x 2 x i16> @llvm.riscv.vrgather.vx.mask.nxv2i16.iXLen(
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 9,
+ iXLen 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i16> %a
}
-define <vscale x 4 x i16> @intrinsic_vrgather_vi_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_vi_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.i64(
+ %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.nxv4i16.iXLen(
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 4 x i16> %a
}
-define <vscale x 4 x i16> @intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i64(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16_i64:
+define <vscale x 4 x i16> @intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.i64(
+ %a = call <vscale x 4 x i16> @llvm.riscv.vrgather.vx.mask.nxv4i16.iXLen(
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 9,
+ iXLen 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i16> %a
}
-define <vscale x 8 x i16> @intrinsic_vrgather_vi_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_vi_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 9
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.i64(
+ %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.nxv8i16.iXLen(
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 8 x i16> %a
}
-define <vscale x 8 x i16> @intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i64(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16_i64:
+define <vscale x 8 x i16> @intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.i64(
+ %a = call <vscale x 8 x i16> @llvm.riscv.vrgather.vx.mask.nxv8i16.iXLen(
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 9,
+ iXLen 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i16> %a
}
-define <vscale x 16 x i16> @intrinsic_vrgather_vi_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_vi_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT: vrgather.vi v12, v8, 9
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.i64(
+ %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.nxv16i16.iXLen(
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 16 x i16> %a
}
-define <vscale x 16 x i16> @intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i64(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16_i64:
+define <vscale x 16 x i16> @intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.i64(
+ %a = call <vscale x 16 x i16> @llvm.riscv.vrgather.vx.mask.nxv16i16.iXLen(
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 9,
+ iXLen 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i16> %a
}
-define <vscale x 32 x i16> @intrinsic_vrgather_vi_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_vi_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vrgather.vi v16, v8, 9
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.i64(
+ %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.nxv32i16.iXLen(
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 32 x i16> %a
}
-define <vscale x 32 x i16> @intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i64(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16_i64:
+define <vscale x 32 x i16> @intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.i64(
+ %a = call <vscale x 32 x i16> @llvm.riscv.vrgather.vx.mask.nxv32i16.iXLen(
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 9,
+ iXLen 9,
<vscale x 32 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 32 x i16> %a
}
-define <vscale x 1 x i32> @intrinsic_vrgather_vi_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_vi_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.i64(
+ %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.nxv1i32.iXLen(
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 1 x i32> %a
}
-define <vscale x 1 x i32> @intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i64(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32_i64:
+define <vscale x 1 x i32> @intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.i64(
+ %a = call <vscale x 1 x i32> @llvm.riscv.vrgather.vx.mask.nxv1i32.iXLen(
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i64 9,
+ iXLen 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vrgather_vi_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_vi_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.i64(
+ %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.nxv2i32.iXLen(
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 2 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i64(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32_i64:
+define <vscale x 2 x i32> @intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.i64(
+ %a = call <vscale x 2 x i32> @llvm.riscv.vrgather.vx.mask.nxv2i32.iXLen(
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i64 9,
+ iXLen 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vrgather_vi_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_vi_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 9
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.i64(
+ %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.nxv4i32.iXLen(
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 4 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i64(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32_i64:
+define <vscale x 4 x i32> @intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.i64(
+ %a = call <vscale x 4 x i32> @llvm.riscv.vrgather.vx.mask.nxv4i32.iXLen(
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i64 9,
+ iXLen 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vrgather_vi_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_vi_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vrgather.vi v12, v8, 9
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.i64(
+ %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.nxv8i32.iXLen(
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 8 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i64(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32_i64:
+define <vscale x 8 x i32> @intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.i64(
+ %a = call <vscale x 8 x i32> @llvm.riscv.vrgather.vx.mask.nxv8i32.iXLen(
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i64 9,
+ iXLen 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vrgather_vi_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_vi_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vrgather.vi v16, v8, 9
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.i64(
+ %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.nxv16i32.iXLen(
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 16 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i64(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32_i64:
+define <vscale x 16 x i32> @intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.i64(
+ %a = call <vscale x 16 x i32> @llvm.riscv.vrgather.vx.mask.nxv16i32.iXLen(
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i64 9,
+ iXLen 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i32> %a
}
-define <vscale x 1 x i64> @intrinsic_vrgather_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_vi_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.i64(
+ %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.nxv1i64.iXLen(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 1 x i64> %a
}
-define <vscale x 1 x i64> @intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64_i64:
+define <vscale x 1 x i64> @intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.i64(
+ %a = call <vscale x 1 x i64> @llvm.riscv.vrgather.vx.mask.nxv1i64.iXLen(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i64 9,
+ iXLen 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i64> %a
}
-define <vscale x 2 x i64> @intrinsic_vrgather_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_vi_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 9
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.i64(
+ %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.nxv2i64.iXLen(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 2 x i64> %a
}
-define <vscale x 2 x i64> @intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64_i64:
+define <vscale x 2 x i64> @intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.i64(
+ %a = call <vscale x 2 x i64> @llvm.riscv.vrgather.vx.mask.nxv2i64.iXLen(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i64 9,
+ iXLen 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i64> %a
}
-define <vscale x 4 x i64> @intrinsic_vrgather_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_vi_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vrgather.vi v12, v8, 9
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.i64(
+ %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.nxv4i64.iXLen(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 4 x i64> %a
}
-define <vscale x 4 x i64> @intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64_i64:
+define <vscale x 4 x i64> @intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.i64(
+ %a = call <vscale x 4 x i64> @llvm.riscv.vrgather.vx.mask.nxv4i64.iXLen(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i64 9,
+ iXLen 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i64> %a
}
-define <vscale x 8 x i64> @intrinsic_vrgather_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_vi_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vrgather.vi v16, v8, 9
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.i64(
+ %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.nxv8i64.iXLen(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 8 x i64> %a
}
-define <vscale x 8 x i64> @intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64_i64:
+define <vscale x 8 x i64> @intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.i64(
+ %a = call <vscale x 8 x i64> @llvm.riscv.vrgather.vx.mask.nxv8i64.iXLen(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i64 9,
+ iXLen 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i64> %a
}
-define <vscale x 1 x half> @intrinsic_vrgather_vi_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_vi_nxv1f16_nxv1f16(<vscale x 1 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.i64(
+ %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.nxv1f16.iXLen(
<vscale x 1 x half> undef,
<vscale x 1 x half> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 1 x half> %a
}
-define <vscale x 1 x half> @intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i64(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16_i64:
+define <vscale x 1 x half> @intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.i64(
+ %a = call <vscale x 1 x half> @llvm.riscv.vrgather.vx.mask.nxv1f16.iXLen(
<vscale x 1 x half> %0,
<vscale x 1 x half> %1,
- i64 9,
+ iXLen 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x half> %a
}
-define <vscale x 2 x half> @intrinsic_vrgather_vi_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_vi_nxv2f16_nxv2f16(<vscale x 2 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.i64(
+ %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.nxv2f16.iXLen(
<vscale x 2 x half> undef,
<vscale x 2 x half> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 2 x half> %a
}
-define <vscale x 2 x half> @intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i64(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16_i64:
+define <vscale x 2 x half> @intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.i64(
+ %a = call <vscale x 2 x half> @llvm.riscv.vrgather.vx.mask.nxv2f16.iXLen(
<vscale x 2 x half> %0,
<vscale x 2 x half> %1,
- i64 9,
+ iXLen 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x half> %a
}
-define <vscale x 4 x half> @intrinsic_vrgather_vi_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_vi_nxv4f16_nxv4f16(<vscale x 4 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.i64(
+ %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.nxv4f16.iXLen(
<vscale x 4 x half> undef,
<vscale x 4 x half> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 4 x half> %a
}
-define <vscale x 4 x half> @intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i64(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16_i64:
+define <vscale x 4 x half> @intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.i64(
+ %a = call <vscale x 4 x half> @llvm.riscv.vrgather.vx.mask.nxv4f16.iXLen(
<vscale x 4 x half> %0,
<vscale x 4 x half> %1,
- i64 9,
+ iXLen 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x half> %a
}
-define <vscale x 8 x half> @intrinsic_vrgather_vi_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_vi_nxv8f16_nxv8f16(<vscale x 8 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 9
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.i64(
+ %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.nxv8f16.iXLen(
<vscale x 8 x half> undef,
<vscale x 8 x half> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 8 x half> %a
}
-define <vscale x 8 x half> @intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i64(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16_i64:
+define <vscale x 8 x half> @intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.i64(
+ %a = call <vscale x 8 x half> @llvm.riscv.vrgather.vx.mask.nxv8f16.iXLen(
<vscale x 8 x half> %0,
<vscale x 8 x half> %1,
- i64 9,
+ iXLen 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x half> %a
}
-define <vscale x 16 x half> @intrinsic_vrgather_vi_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_vi_nxv16f16_nxv16f16(<vscale x 16 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT: vrgather.vi v12, v8, 9
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.i64(
+ %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.nxv16f16.iXLen(
<vscale x 16 x half> undef,
<vscale x 16 x half> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 16 x half> %a
}
-define <vscale x 16 x half> @intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i64(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16_i64:
+define <vscale x 16 x half> @intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.i64(
+ %a = call <vscale x 16 x half> @llvm.riscv.vrgather.vx.mask.nxv16f16.iXLen(
<vscale x 16 x half> %0,
<vscale x 16 x half> %1,
- i64 9,
+ iXLen 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x half> %a
}
-define <vscale x 32 x half> @intrinsic_vrgather_vi_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_vi_nxv32f16_nxv32f16(<vscale x 32 x half> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT: vrgather.vi v16, v8, 9
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.i64(
+ %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.nxv32f16.iXLen(
<vscale x 32 x half> undef,
<vscale x 32 x half> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 32 x half> %a
}
-define <vscale x 32 x half> @intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i64(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16_i64:
+define <vscale x 32 x half> @intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.i64(
+ %a = call <vscale x 32 x half> @llvm.riscv.vrgather.vx.mask.nxv32f16.iXLen(
<vscale x 32 x half> %0,
<vscale x 32 x half> %1,
- i64 9,
+ iXLen 9,
<vscale x 32 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 32 x half> %a
}
-define <vscale x 1 x float> @intrinsic_vrgather_vi_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_vi_nxv1f32_nxv1f32(<vscale x 1 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.i64(
+ %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.nxv1f32.iXLen(
<vscale x 1 x float> undef,
<vscale x 1 x float> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 1 x float> %a
}
-define <vscale x 1 x float> @intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i64(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32_i64:
+define <vscale x 1 x float> @intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.i64(
+ %a = call <vscale x 1 x float> @llvm.riscv.vrgather.vx.mask.nxv1f32.iXLen(
<vscale x 1 x float> %0,
<vscale x 1 x float> %1,
- i64 9,
+ iXLen 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x float> %a
}
-define <vscale x 2 x float> @intrinsic_vrgather_vi_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_vi_nxv2f32_nxv2f32(<vscale x 2 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.i64(
+ %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.nxv2f32.iXLen(
<vscale x 2 x float> undef,
<vscale x 2 x float> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 2 x float> %a
}
-define <vscale x 2 x float> @intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i64(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32_i64:
+define <vscale x 2 x float> @intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f32_nxv2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.i64(
+ %a = call <vscale x 2 x float> @llvm.riscv.vrgather.vx.mask.nxv2f32.iXLen(
<vscale x 2 x float> %0,
<vscale x 2 x float> %1,
- i64 9,
+ iXLen 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x float> %a
}
-define <vscale x 4 x float> @intrinsic_vrgather_vi_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_vi_nxv4f32_nxv4f32(<vscale x 4 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 9
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.i64(
+ %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.nxv4f32.iXLen(
<vscale x 4 x float> undef,
<vscale x 4 x float> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 4 x float> %a
}
-define <vscale x 4 x float> @intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i64(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32_i64:
+define <vscale x 4 x float> @intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.i64(
+ %a = call <vscale x 4 x float> @llvm.riscv.vrgather.vx.mask.nxv4f32.iXLen(
<vscale x 4 x float> %0,
<vscale x 4 x float> %1,
- i64 9,
+ iXLen 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x float> %a
}
-define <vscale x 8 x float> @intrinsic_vrgather_vi_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_vi_nxv8f32_nxv8f32(<vscale x 8 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vrgather.vi v12, v8, 9
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.i64(
+ %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.nxv8f32.iXLen(
<vscale x 8 x float> undef,
<vscale x 8 x float> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 8 x float> %a
}
-define <vscale x 8 x float> @intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i64(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32_i64:
+define <vscale x 8 x float> @intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.i64(
+ %a = call <vscale x 8 x float> @llvm.riscv.vrgather.vx.mask.nxv8f32.iXLen(
<vscale x 8 x float> %0,
<vscale x 8 x float> %1,
- i64 9,
+ iXLen 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x float> %a
}
-define <vscale x 16 x float> @intrinsic_vrgather_vi_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_vi_nxv16f32_nxv16f32(<vscale x 16 x float> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT: vrgather.vi v16, v8, 9
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.i64(
+ %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.nxv16f32.iXLen(
<vscale x 16 x float> undef,
<vscale x 16 x float> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 16 x float> %a
}
-define <vscale x 16 x float> @intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i64(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32_i64:
+define <vscale x 16 x float> @intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.i64(
+ %a = call <vscale x 16 x float> @llvm.riscv.vrgather.vx.mask.nxv16f32.iXLen(
<vscale x 16 x float> %0,
<vscale x 16 x float> %1,
- i64 9,
+ iXLen 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x float> %a
}
-define <vscale x 1 x double> @intrinsic_vrgather_vi_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_vi_nxv1f64_nxv1f64(<vscale x 1 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
; CHECK-NEXT: vrgather.vi v9, v8, 9
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.i64(
+ %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.nxv1f64.iXLen(
<vscale x 1 x double> undef,
<vscale x 1 x double> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 1 x double> %a
}
-define <vscale x 1 x double> @intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64_i64:
+define <vscale x 1 x double> @intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv1f64_nxv1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vi v8, v9, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.i64(
+ %a = call <vscale x 1 x double> @llvm.riscv.vrgather.vx.mask.nxv1f64.iXLen(
<vscale x 1 x double> %0,
<vscale x 1 x double> %1,
- i64 9,
+ iXLen 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x double> %a
}
-define <vscale x 2 x double> @intrinsic_vrgather_vi_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_vi_nxv2f64_nxv2f64(<vscale x 2 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-NEXT: vrgather.vi v10, v8, 9
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.i64(
+ %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.nxv2f64.iXLen(
<vscale x 2 x double> undef,
<vscale x 2 x double> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 2 x double> %a
}
-define <vscale x 2 x double> @intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64_i64:
+define <vscale x 2 x double> @intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv2f64_nxv2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
; CHECK-NEXT: vrgather.vi v8, v10, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.i64(
+ %a = call <vscale x 2 x double> @llvm.riscv.vrgather.vx.mask.nxv2f64.iXLen(
<vscale x 2 x double> %0,
<vscale x 2 x double> %1,
- i64 9,
+ iXLen 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x double> %a
}
-define <vscale x 4 x double> @intrinsic_vrgather_vi_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_vi_nxv4f64_nxv4f64(<vscale x 4 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
; CHECK-NEXT: vrgather.vi v12, v8, 9
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.i64(
+ %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.nxv4f64.iXLen(
<vscale x 4 x double> undef,
<vscale x 4 x double> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 4 x double> %a
}
-define <vscale x 4 x double> @intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64_i64:
+define <vscale x 4 x double> @intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
; CHECK-NEXT: vrgather.vi v8, v12, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.i64(
+ %a = call <vscale x 4 x double> @llvm.riscv.vrgather.vx.mask.nxv4f64.iXLen(
<vscale x 4 x double> %0,
<vscale x 4 x double> %1,
- i64 9,
+ iXLen 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x double> %a
}
-define <vscale x 8 x double> @intrinsic_vrgather_vi_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_vi_nxv8f64_nxv8f64(<vscale x 8 x double> %0, iXLen %1) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_vi_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
; CHECK-NEXT: vrgather.vi v16, v8, 9
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.i64(
+ %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.nxv8f64.iXLen(
<vscale x 8 x double> undef,
<vscale x 8 x double> %0,
- i64 9,
- i64 %1)
+ iXLen 9,
+ iXLen %1)
ret <vscale x 8 x double> %a
}
-define <vscale x 8 x double> @intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
-; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64_i64:
+define <vscale x 8 x double> @intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vrgather_mask_vi_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu
; CHECK-NEXT: vrgather.vi v8, v16, 9, v0.t
; CHECK-NEXT: ret
entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.i64(
+ %a = call <vscale x 8 x double> @llvm.riscv.vrgather.vx.mask.nxv8f64.iXLen(
<vscale x 8 x double> %0,
<vscale x 8 x double> %1,
- i64 9,
+ iXLen 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x double> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv32.ll
deleted file mode 100644
index ce14b19..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv32.ll
+++ /dev/null
@@ -1,1449 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv1i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.nxv2i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv2i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.nxv4i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv4i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v9, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv8i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.nxv32i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv32i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.nxv1i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv1i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.nxv2i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv2i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.nxv4i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv4i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.nxv8i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv8i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.nxv16i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv16i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.nxv32i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv32i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.nxv1i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv1i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.nxv4i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv4i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.nxv8i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv8i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.nxv16i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl4re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv16i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.nxv4i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv4i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.nxv8i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl2re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv8i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.nxv1f16(
- <vscale x 1 x half>,
- <vscale x 1 x half>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x half> @intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.nxv1f16(
- <vscale x 1 x half> undef,
- <vscale x 1 x half> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv1f16(
- <vscale x 1 x half>,
- <vscale x 1 x half>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x half> @intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv1f16(
- <vscale x 1 x half> %0,
- <vscale x 1 x half> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.nxv2f16(
- <vscale x 2 x half>,
- <vscale x 2 x half>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x half> @intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.nxv2f16(
- <vscale x 2 x half> undef,
- <vscale x 2 x half> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv2f16(
- <vscale x 2 x half>,
- <vscale x 2 x half>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x half> @intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv2f16(
- <vscale x 2 x half> %0,
- <vscale x 2 x half> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.nxv4f16(
- <vscale x 4 x half>,
- <vscale x 4 x half>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x half> @intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.nxv4f16(
- <vscale x 4 x half> undef,
- <vscale x 4 x half> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv4f16(
- <vscale x 4 x half>,
- <vscale x 4 x half>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x half> @intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv4f16(
- <vscale x 4 x half> %0,
- <vscale x 4 x half> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.nxv8f16(
- <vscale x 8 x half>,
- <vscale x 8 x half>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x half> @intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.nxv8f16(
- <vscale x 8 x half> undef,
- <vscale x 8 x half> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv8f16(
- <vscale x 8 x half>,
- <vscale x 8 x half>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x half> @intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv8f16(
- <vscale x 8 x half> %0,
- <vscale x 8 x half> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.nxv16f16(
- <vscale x 16 x half>,
- <vscale x 16 x half>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x half> @intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.nxv16f16(
- <vscale x 16 x half> undef,
- <vscale x 16 x half> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv16f16(
- <vscale x 16 x half>,
- <vscale x 16 x half>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x half> @intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv16f16(
- <vscale x 16 x half> %0,
- <vscale x 16 x half> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.nxv32f16(
- <vscale x 32 x half>,
- <vscale x 32 x half>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x half> @intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.nxv32f16(
- <vscale x 32 x half> undef,
- <vscale x 32 x half> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv32f16(
- <vscale x 32 x half>,
- <vscale x 32 x half>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x half> @intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv32f16(
- <vscale x 32 x half> %0,
- <vscale x 32 x half> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x half> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.nxv1f32(
- <vscale x 1 x float>,
- <vscale x 1 x float>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x float> @intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v9
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.nxv1f32(
- <vscale x 1 x float> undef,
- <vscale x 1 x float> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv1f32(
- <vscale x 1 x float>,
- <vscale x 1 x float>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x float> @intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv1f32(
- <vscale x 1 x float> %0,
- <vscale x 1 x float> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.nxv4f32(
- <vscale x 4 x float>,
- <vscale x 4 x float>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x float> @intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v10
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.nxv4f32(
- <vscale x 4 x float> undef,
- <vscale x 4 x float> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv4f32(
- <vscale x 4 x float>,
- <vscale x 4 x float>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x float> @intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv4f32(
- <vscale x 4 x float> %0,
- <vscale x 4 x float> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.nxv8f32(
- <vscale x 8 x float>,
- <vscale x 8 x float>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x float> @intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.nxv8f32(
- <vscale x 8 x float> undef,
- <vscale x 8 x float> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv8f32(
- <vscale x 8 x float>,
- <vscale x 8 x float>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x float> @intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv8f32(
- <vscale x 8 x float> %0,
- <vscale x 8 x float> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.nxv16f32(
- <vscale x 16 x float>,
- <vscale x 16 x float>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x float> @intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.nxv16f32(
- <vscale x 16 x float> undef,
- <vscale x 16 x float> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv16f32(
- <vscale x 16 x float>,
- <vscale x 16 x float>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x float> @intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl4re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv16f32(
- <vscale x 16 x float> %0,
- <vscale x 16 x float> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x float> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.nxv4f64(
- <vscale x 4 x double>,
- <vscale x 4 x double>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x double> @intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.nxv4f64(
- <vscale x 4 x double> undef,
- <vscale x 4 x double> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv4f64(
- <vscale x 4 x double>,
- <vscale x 4 x double>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x double> @intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv4f64(
- <vscale x 4 x double> %0,
- <vscale x 4 x double> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.nxv8f64(
- <vscale x 8 x double>,
- <vscale x 8 x double>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x double> @intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v24
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.nxv8f64(
- <vscale x 8 x double> undef,
- <vscale x 8 x double> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x double> %a
-}
-
-declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv8f64(
- <vscale x 8 x double>,
- <vscale x 8 x double>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x double> @intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl2re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv8f64(
- <vscale x 8 x double> %0,
- <vscale x 8 x double> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x double> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
index ea54912..d1e947e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zfh,+zvfh \
+; RUN: -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zfh,+zvfh \
+; RUN: -verify-machineinstrs | FileCheck %s
declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i16>,
- i64);
+ iXLen);
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -20,7 +22,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -30,10 +32,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 1 x i8> @intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
@@ -45,7 +47,7 @@ entry:
<vscale x 1 x i8> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -54,9 +56,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i16>,
- i64);
+ iXLen);
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -68,7 +70,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -78,10 +80,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 2 x i8> @intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
@@ -93,7 +95,7 @@ entry:
<vscale x 2 x i8> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -102,9 +104,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i16>,
- i64);
+ iXLen);
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -116,7 +118,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -126,10 +128,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 4 x i8> @intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
@@ -141,7 +143,7 @@ entry:
<vscale x 4 x i8> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -150,9 +152,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i16>,
- i64);
+ iXLen);
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -164,7 +166,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -174,10 +176,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 8 x i8> @intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
@@ -189,7 +191,7 @@ entry:
<vscale x 8 x i8> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -198,9 +200,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i16>,
- i64);
+ iXLen);
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -212,7 +214,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -222,10 +224,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 16 x i8> @intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
@@ -237,7 +239,7 @@ entry:
<vscale x 16 x i8> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -246,9 +248,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i16>,
- i64);
+ iXLen);
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -260,7 +262,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -270,10 +272,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vrgatherei16.vv.mask.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 32 x i8> @intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
@@ -285,7 +287,7 @@ entry:
<vscale x 32 x i8> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -294,9 +296,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64);
+ iXLen);
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -308,7 +310,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -318,10 +320,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 1 x i16> @intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -333,7 +335,7 @@ entry:
<vscale x 1 x i16> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -342,9 +344,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64);
+ iXLen);
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -356,7 +358,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -366,10 +368,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 2 x i16> @intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -381,7 +383,7 @@ entry:
<vscale x 2 x i16> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -390,9 +392,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64);
+ iXLen);
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -404,7 +406,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -414,10 +416,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 4 x i16> @intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -429,7 +431,7 @@ entry:
<vscale x 4 x i16> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -438,9 +440,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64);
+ iXLen);
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -452,7 +454,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -462,10 +464,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 8 x i16> @intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -477,7 +479,7 @@ entry:
<vscale x 8 x i16> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -486,9 +488,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64);
+ iXLen);
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -500,7 +502,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -510,10 +512,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 16 x i16> @intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -525,7 +527,7 @@ entry:
<vscale x 16 x i16> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -534,9 +536,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64);
+ iXLen);
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -548,7 +550,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -558,10 +560,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vrgatherei16.vv.mask.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 32 x i16> @intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -574,7 +576,7 @@ entry:
<vscale x 32 x i16> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -583,9 +585,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i16>,
- i64);
+ iXLen);
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -597,7 +599,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -607,10 +609,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 1 x i32> @intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -622,7 +624,7 @@ entry:
<vscale x 1 x i32> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -631,9 +633,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i16>,
- i64);
+ iXLen);
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -645,7 +647,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -655,10 +657,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 4 x i32> @intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -670,7 +672,7 @@ entry:
<vscale x 4 x i32> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -679,9 +681,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i16>,
- i64);
+ iXLen);
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -693,7 +695,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -703,10 +705,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 8 x i32> @intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -718,7 +720,7 @@ entry:
<vscale x 8 x i32> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -727,9 +729,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i16>,
- i64);
+ iXLen);
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -741,7 +743,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -751,10 +753,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vrgatherei16.vv.mask.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 16 x i32> @intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl4re16.v v24, (a0)
@@ -767,7 +769,7 @@ entry:
<vscale x 16 x i32> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -776,9 +778,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i16>,
- i64);
+ iXLen);
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -790,7 +792,7 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -800,10 +802,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 4 x i64> @intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -815,7 +817,7 @@ entry:
<vscale x 4 x i64> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -824,9 +826,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i16>,
- i64);
+ iXLen);
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -838,7 +840,7 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -848,10 +850,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vrgatherei16.vv.mask.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 8 x i64> @intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl2re16.v v24, (a0)
@@ -864,7 +866,7 @@ entry:
<vscale x 8 x i64> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
@@ -873,9 +875,9 @@ declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.nxv1f16(
<vscale x 1 x half>,
<vscale x 1 x half>,
<vscale x 1 x i16>,
- i64);
+ iXLen);
-define <vscale x 1 x half> @intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -887,7 +889,7 @@ entry:
<vscale x 1 x half> undef,
<vscale x 1 x half> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x half> %a
}
@@ -897,10 +899,10 @@ declare <vscale x 1 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv1f16(
<vscale x 1 x half>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 1 x half> @intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x half> @intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16(<vscale x 1 x half> %0, <vscale x 1 x half> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f16_nxv1f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -912,7 +914,7 @@ entry:
<vscale x 1 x half> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x half> %a
}
@@ -921,9 +923,9 @@ declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.nxv2f16(
<vscale x 2 x half>,
<vscale x 2 x half>,
<vscale x 2 x i16>,
- i64);
+ iXLen);
-define <vscale x 2 x half> @intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -935,7 +937,7 @@ entry:
<vscale x 2 x half> undef,
<vscale x 2 x half> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x half> %a
}
@@ -945,10 +947,10 @@ declare <vscale x 2 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv2f16(
<vscale x 2 x half>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 2 x half> @intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x half> @intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv2f16_nxv2f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -960,7 +962,7 @@ entry:
<vscale x 2 x half> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x half> %a
}
@@ -969,9 +971,9 @@ declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.nxv4f16(
<vscale x 4 x half>,
<vscale x 4 x half>,
<vscale x 4 x i16>,
- i64);
+ iXLen);
-define <vscale x 4 x half> @intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -983,7 +985,7 @@ entry:
<vscale x 4 x half> undef,
<vscale x 4 x half> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x half> %a
}
@@ -993,10 +995,10 @@ declare <vscale x 4 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv4f16(
<vscale x 4 x half>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 4 x half> @intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x half> @intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f16_nxv4f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -1008,7 +1010,7 @@ entry:
<vscale x 4 x half> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x half> %a
}
@@ -1017,9 +1019,9 @@ declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.nxv8f16(
<vscale x 8 x half>,
<vscale x 8 x half>,
<vscale x 8 x i16>,
- i64);
+ iXLen);
-define <vscale x 8 x half> @intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -1031,7 +1033,7 @@ entry:
<vscale x 8 x half> undef,
<vscale x 8 x half> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x half> %a
}
@@ -1041,10 +1043,10 @@ declare <vscale x 8 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv8f16(
<vscale x 8 x half>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 8 x half> @intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x half> @intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f16_nxv8f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -1056,7 +1058,7 @@ entry:
<vscale x 8 x half> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x half> %a
}
@@ -1065,9 +1067,9 @@ declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.nxv16f16(
<vscale x 16 x half>,
<vscale x 16 x half>,
<vscale x 16 x i16>,
- i64);
+ iXLen);
-define <vscale x 16 x half> @intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -1079,7 +1081,7 @@ entry:
<vscale x 16 x half> undef,
<vscale x 16 x half> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x half> %a
}
@@ -1089,10 +1091,10 @@ declare <vscale x 16 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv16f16(
<vscale x 16 x half>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 16 x half> @intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x half> @intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f16_nxv16f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -1104,7 +1106,7 @@ entry:
<vscale x 16 x half> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x half> %a
}
@@ -1113,9 +1115,9 @@ declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.nxv32f16(
<vscale x 32 x half>,
<vscale x 32 x half>,
<vscale x 32 x i16>,
- i64);
+ iXLen);
-define <vscale x 32 x half> @intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -1127,7 +1129,7 @@ entry:
<vscale x 32 x half> undef,
<vscale x 32 x half> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x half> %a
}
@@ -1137,10 +1139,10 @@ declare <vscale x 32 x half> @llvm.riscv.vrgatherei16.vv.mask.nxv32f16(
<vscale x 32 x half>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 32 x half> @intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x half> @intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16(<vscale x 32 x half> %0, <vscale x 32 x half> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv32f16_nxv32f16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -1153,7 +1155,7 @@ entry:
<vscale x 32 x half> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x half> %a
}
@@ -1162,9 +1164,9 @@ declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.nxv1f32(
<vscale x 1 x float>,
<vscale x 1 x float>,
<vscale x 1 x i16>,
- i64);
+ iXLen);
-define <vscale x 1 x float> @intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -1176,7 +1178,7 @@ entry:
<vscale x 1 x float> undef,
<vscale x 1 x float> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x float> %a
}
@@ -1186,10 +1188,10 @@ declare <vscale x 1 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv1f32(
<vscale x 1 x float>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 1 x float> @intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x float> @intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32(<vscale x 1 x float> %0, <vscale x 1 x float> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv1f32_nxv1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -1201,7 +1203,7 @@ entry:
<vscale x 1 x float> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x float> %a
}
@@ -1210,9 +1212,9 @@ declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.nxv4f32(
<vscale x 4 x float>,
<vscale x 4 x float>,
<vscale x 4 x i16>,
- i64);
+ iXLen);
-define <vscale x 4 x float> @intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -1224,7 +1226,7 @@ entry:
<vscale x 4 x float> undef,
<vscale x 4 x float> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x float> %a
}
@@ -1234,10 +1236,10 @@ declare <vscale x 4 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv4f32(
<vscale x 4 x float>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 4 x float> @intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x float> @intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f32_nxv4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -1249,7 +1251,7 @@ entry:
<vscale x 4 x float> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x float> %a
}
@@ -1258,9 +1260,9 @@ declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.nxv8f32(
<vscale x 8 x float>,
<vscale x 8 x float>,
<vscale x 8 x i16>,
- i64);
+ iXLen);
-define <vscale x 8 x float> @intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -1272,7 +1274,7 @@ entry:
<vscale x 8 x float> undef,
<vscale x 8 x float> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x float> %a
}
@@ -1282,10 +1284,10 @@ declare <vscale x 8 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv8f32(
<vscale x 8 x float>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 8 x float> @intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x float> @intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f32_nxv8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -1297,7 +1299,7 @@ entry:
<vscale x 8 x float> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x float> %a
}
@@ -1306,9 +1308,9 @@ declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.nxv16f32(
<vscale x 16 x float>,
<vscale x 16 x float>,
<vscale x 16 x i16>,
- i64);
+ iXLen);
-define <vscale x 16 x float> @intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -1320,7 +1322,7 @@ entry:
<vscale x 16 x float> undef,
<vscale x 16 x float> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x float> %a
}
@@ -1330,10 +1332,10 @@ declare <vscale x 16 x float> @llvm.riscv.vrgatherei16.vv.mask.nxv16f32(
<vscale x 16 x float>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 16 x float> @intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x float> @intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32(<vscale x 16 x float> %0, <vscale x 16 x float> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv16f32_nxv16f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl4re16.v v24, (a0)
@@ -1346,7 +1348,7 @@ entry:
<vscale x 16 x float> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x float> %a
}
@@ -1355,9 +1357,9 @@ declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.nxv4f64(
<vscale x 4 x double>,
<vscale x 4 x double>,
<vscale x 4 x i16>,
- i64);
+ iXLen);
-define <vscale x 4 x double> @intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -1369,7 +1371,7 @@ entry:
<vscale x 4 x double> undef,
<vscale x 4 x double> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x double> %a
}
@@ -1379,10 +1381,10 @@ declare <vscale x 4 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv4f64(
<vscale x 4 x double>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 4 x double> @intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x double> @intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv4f64_nxv4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -1394,7 +1396,7 @@ entry:
<vscale x 4 x double> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x double> %a
}
@@ -1403,9 +1405,9 @@ declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.nxv8f64(
<vscale x 8 x double>,
<vscale x 8 x double>,
<vscale x 8 x i16>,
- i64);
+ iXLen);
-define <vscale x 8 x double> @intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_vv_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1417,7 +1419,7 @@ entry:
<vscale x 8 x double> undef,
<vscale x 8 x double> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x double> %a
}
@@ -1427,10 +1429,10 @@ declare <vscale x 8 x double> @llvm.riscv.vrgatherei16.vv.mask.nxv8f64(
<vscale x 8 x double>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen);
-define <vscale x 8 x double> @intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x double> @intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64(<vscale x 8 x double> %0, <vscale x 8 x double> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vrgatherei16_mask_vv_nxv8f64_nxv8f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl2re16.v v24, (a0)
@@ -1443,7 +1445,7 @@ entry:
<vscale x 8 x double> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x double> %a
}
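
Note on the i64 -> iXLen rewrite in the hunks above: iXLen is not a real LLVM IR type; it is a placeholder that the test's RUN lines textually substitute with the target's XLEN before invoking llc, so one test body can drive both riscv32 and riscv64. A minimal sketch of that driver pattern, assuming the usual sed-based RUN lines and a plain +v attribute (the exact RUN lines and feature flags for this file are not shown in this hunk):

; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s

declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
  <vscale x 1 x i8>,
  <vscale x 1 x i8>,
  <vscale x 1 x i16>,
  iXLen);   ; rewritten to i32 or i64 by the sed substitution above

Because a single width-agnostic file can then check both targets, the per-width vsadd-rv32.ll test deleted below is presumably no longer needed, its coverage moving to a combined test.
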
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-rv32.ll
deleted file mode 100644
index c2586e4..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv32.ll
+++ /dev/null
@@ -1,2849 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.nxv1i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- <vscale x 1 x i8> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.nxv2i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- <vscale x 2 x i8> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.nxv4i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- <vscale x 4 x i8> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.nxv8i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.nxv16i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- <vscale x 16 x i8> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.nxv32i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- <vscale x 32 x i8> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.nxv64i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- <vscale x 64 x i8> %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.nxv1i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.nxv2i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.nxv4i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.nxv8i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.nxv16i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.nxv32i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.nxv1i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- <vscale x 1 x i32> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.nxv2i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- <vscale x 2 x i32> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.nxv4i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- <vscale x 4 x i32> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.nxv8i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- <vscale x 8 x i32> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.nxv16i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- <vscale x 16 x i32> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- <vscale x 1 x i64> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.nxv2i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- <vscale x 2 x i64> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.nxv4i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- <vscale x 4 x i64> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vsadd.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.nxv8i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- <vscale x 8 x i64> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v9, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v8, v9
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v9, v10, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v8, v10
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v10, v12, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v8, v12
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v12, v16, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v8, v16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT: vlse64.v v24, (a0), zero
-; CHECK-NEXT: vsadd.vv v8, v16, v24, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 9,
- <vscale x 32 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 9,
- <vscale x 64 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 9,
- <vscale x 32 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vsadd.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT: vsadd.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd.ll
index ca56ad21..a108d98 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
<vscale x 1 x i8> %1,
<vscale x 1 x i8> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
<vscale x 2 x i8> %1,
<vscale x 2 x i8> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
<vscale x 4 x i8> %1,
<vscale x 4 x i8> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
<vscale x 8 x i8> %1,
<vscale x 8 x i8> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
<vscale x 16 x i8> %1,
<vscale x 16 x i8> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
<vscale x 32 x i8> %1,
<vscale x 32 x i8> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
<vscale x 64 x i8> %1,
<vscale x 64 x i8> %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
<vscale x 1 x i16> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
<vscale x 2 x i16> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
<vscale x 4 x i16> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
<vscale x 8 x i16> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
<vscale x 16 x i16> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
<vscale x 32 x i16> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
<vscale x 1 x i32> %1,
<vscale x 1 x i32> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
<vscale x 2 x i32> %1,
<vscale x 2 x i32> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
<vscale x 4 x i32> %1,
<vscale x 4 x i32> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
<vscale x 8 x i32> %1,
<vscale x 8 x i32> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
<vscale x 16 x i32> %1,
<vscale x 16 x i32> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64);
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
<vscale x 1 x i64> %1,
<vscale x 1 x i64> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64);
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
<vscale x 2 x i64> %1,
<vscale x 2 x i64> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64);
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
<vscale x 4 x i64> %1,
<vscale x 4 x i64> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64);
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
<vscale x 8 x i64> %1,
<vscale x 8 x i64> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsadd.nxv1i8.i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsadd.mask.nxv1i8.i8(
<vscale x 1 x i8>,
i8,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
<vscale x 1 x i8> %1,
i8 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.nxv2i8.i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsadd.mask.nxv2i8.i8(
<vscale x 2 x i8>,
i8,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
<vscale x 2 x i8> %1,
i8 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.nxv4i8.i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsadd.mask.nxv4i8.i8(
<vscale x 4 x i8>,
i8,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
<vscale x 4 x i8> %1,
i8 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.nxv8i8.i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsadd.mask.nxv8i8.i8(
<vscale x 8 x i8>,
i8,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
<vscale x 8 x i8> %1,
i8 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.nxv16i8.i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsadd.mask.nxv16i8.i8(
<vscale x 16 x i8>,
i8,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
<vscale x 16 x i8> %1,
i8 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.nxv32i8.i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsadd.mask.nxv32i8.i8(
<vscale x 32 x i8>,
i8,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
<vscale x 32 x i8> %1,
i8 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.nxv64i8.i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsadd.mask.nxv64i8.i8(
<vscale x 64 x i8>,
i8,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
<vscale x 64 x i8> %1,
i8 %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.nxv1i16.i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsadd.mask.nxv1i16.i16(
<vscale x 1 x i16>,
i16,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
<vscale x 1 x i16> %1,
i16 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.nxv2i16.i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsadd.mask.nxv2i16.i16(
<vscale x 2 x i16>,
i16,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
<vscale x 2 x i16> %1,
i16 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.nxv4i16.i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsadd.mask.nxv4i16.i16(
<vscale x 4 x i16>,
i16,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
<vscale x 4 x i16> %1,
i16 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.nxv8i16.i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsadd.mask.nxv8i16.i16(
<vscale x 8 x i16>,
i16,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
<vscale x 8 x i16> %1,
i16 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.nxv16i16.i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsadd.mask.nxv16i16.i16(
<vscale x 16 x i16>,
i16,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
<vscale x 16 x i16> %1,
i16 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.nxv32i16.i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsadd.mask.nxv32i16.i16(
<vscale x 32 x i16>,
i16,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
<vscale x 32 x i16> %1,
i16 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.nxv1i32.i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsadd.mask.nxv1i32.i32(
<vscale x 1 x i32>,
i32,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
<vscale x 1 x i32> %1,
i32 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.nxv2i32.i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsadd.mask.nxv2i32.i32(
<vscale x 2 x i32>,
i32,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
<vscale x 2 x i32> %1,
i32 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.nxv4i32.i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsadd.mask.nxv4i32.i32(
<vscale x 4 x i32>,
i32,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
<vscale x 4 x i32> %1,
i32 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.nxv8i32.i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsadd.mask.nxv8i32.i32(
<vscale x 8 x i32>,
i32,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
<vscale x 8 x i32> %1,
i32 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.nxv16i32.i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsadd.mask.nxv16i32.i32(
<vscale x 16 x i32>,
i32,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
<vscale x 16 x i32> %1,
i32 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -1890,20 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -1913,22 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
<vscale x 1 x i64>,
i64,
<vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v9, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vsadd.vx v8, v9, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.i64(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
i64 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -1937,20 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vsadd.nxv2i64.i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -1960,22 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
<vscale x 2 x i64>,
i64,
<vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v10, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT: vsadd.vx v8, v10, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vsadd.mask.nxv2i64.i64(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
i64 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -1984,20 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vsadd.nxv4i64.i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -2007,22 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
<vscale x 4 x i64>,
i64,
<vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v12, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT: vsadd.vx v8, v12, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vsadd.mask.nxv4i64.i64(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
i64 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -2031,20 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vsadd.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vsadd.nxv8i64.i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -2054,27 +2140,39 @@ declare <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
<vscale x 8 x i64>,
i64,
<vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vsadd.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT: vlse64.v v24, (a0), zero
+; RV32-NEXT: vsadd.vv v8, v16, v24, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsadd_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT: vsadd.vx v8, v16, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vsadd.mask.nxv8i64.i64(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
i64 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
-define <vscale x 1 x i8> @intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i64 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -2085,12 +2183,12 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i8> %a
}
-define <vscale x 1 x i8> @intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
@@ -2102,12 +2200,12 @@ entry:
<vscale x 1 x i8> %1,
i8 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i8> %a
}
-define <vscale x 2 x i8> @intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i64 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -2118,12 +2216,12 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i8> %a
}
-define <vscale x 2 x i8> @intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
@@ -2135,12 +2233,12 @@ entry:
<vscale x 2 x i8> %1,
i8 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i8> %a
}
-define <vscale x 4 x i8> @intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i64 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -2151,12 +2249,12 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i8> %a
}
-define <vscale x 4 x i8> @intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
@@ -2168,12 +2266,12 @@ entry:
<vscale x 4 x i8> %1,
i8 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i8> %a
}
-define <vscale x 8 x i8> @intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i64 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -2184,12 +2282,12 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i8> %a
}
-define <vscale x 8 x i8> @intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
@@ -2201,12 +2299,12 @@ entry:
<vscale x 8 x i8> %1,
i8 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i8> %a
}
-define <vscale x 16 x i8> @intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i64 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -2217,12 +2315,12 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i8> %a
}
-define <vscale x 16 x i8> @intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
@@ -2234,12 +2332,12 @@ entry:
<vscale x 16 x i8> %1,
i8 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i8> %a
}
-define <vscale x 32 x i8> @intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i64 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -2250,12 +2348,12 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 32 x i8> %a
}
-define <vscale x 32 x i8> @intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
@@ -2267,12 +2365,12 @@ entry:
<vscale x 32 x i8> %1,
i8 9,
<vscale x 32 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 32 x i8> %a
}
-define <vscale x 64 x i8> @intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i64 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -2283,12 +2381,12 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 64 x i8> %a
}
-define <vscale x 64 x i8> @intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i64 %3) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
@@ -2300,12 +2398,12 @@ entry:
<vscale x 64 x i8> %1,
i8 9,
<vscale x 64 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 64 x i8> %a
}
-define <vscale x 1 x i16> @intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i64 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -2316,12 +2414,12 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i16> %a
}
-define <vscale x 1 x i16> @intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -2333,12 +2431,12 @@ entry:
<vscale x 1 x i16> %1,
i16 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i16> %a
}
-define <vscale x 2 x i16> @intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i64 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -2349,12 +2447,12 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i16> %a
}
-define <vscale x 2 x i16> @intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -2366,12 +2464,12 @@ entry:
<vscale x 2 x i16> %1,
i16 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i16> %a
}
-define <vscale x 4 x i16> @intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i64 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -2382,12 +2480,12 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i16> %a
}
-define <vscale x 4 x i16> @intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -2399,12 +2497,12 @@ entry:
<vscale x 4 x i16> %1,
i16 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i16> %a
}
-define <vscale x 8 x i16> @intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i64 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -2415,12 +2513,12 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i16> %a
}
-define <vscale x 8 x i16> @intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -2432,12 +2530,12 @@ entry:
<vscale x 8 x i16> %1,
i16 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i16> %a
}
-define <vscale x 16 x i16> @intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i64 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -2448,12 +2546,12 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i16> %a
}
-define <vscale x 16 x i16> @intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -2465,12 +2563,12 @@ entry:
<vscale x 16 x i16> %1,
i16 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i16> %a
}
-define <vscale x 32 x i16> @intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i64 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -2481,12 +2579,12 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 32 x i16> %a
}
-define <vscale x 32 x i16> @intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
@@ -2498,12 +2596,12 @@ entry:
<vscale x 32 x i16> %1,
i16 9,
<vscale x 32 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 32 x i16> %a
}
-define <vscale x 1 x i32> @intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i64 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -2514,12 +2612,12 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i32> %a
}
-define <vscale x 1 x i32> @intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -2531,12 +2629,12 @@ entry:
<vscale x 1 x i32> %1,
i32 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i64 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -2547,12 +2645,12 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
@@ -2564,12 +2662,12 @@ entry:
<vscale x 2 x i32> %1,
i32 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i64 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -2580,12 +2678,12 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -2597,12 +2695,12 @@ entry:
<vscale x 4 x i32> %1,
i32 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i64 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -2613,12 +2711,12 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -2630,12 +2728,12 @@ entry:
<vscale x 8 x i32> %1,
i32 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i64 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -2646,12 +2744,12 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
@@ -2663,12 +2761,12 @@ entry:
<vscale x 16 x i32> %1,
i32 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i32> %a
}
-define <vscale x 1 x i64> @intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv1i64_nxv1i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -2679,12 +2777,12 @@ entry:
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i64> %a
}
-define <vscale x 1 x i64> @intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv1i64_nxv1i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
@@ -2696,12 +2794,12 @@ entry:
<vscale x 1 x i64> %1,
i64 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i64> %a
}
-define <vscale x 2 x i64> @intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv2i64_nxv2i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -2712,12 +2810,12 @@ entry:
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i64> %a
}
-define <vscale x 2 x i64> @intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv2i64_nxv2i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
@@ -2729,12 +2827,12 @@ entry:
<vscale x 2 x i64> %1,
i64 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i64> %a
}
-define <vscale x 4 x i64> @intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv4i64_nxv4i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -2745,12 +2843,12 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i64> %a
}
-define <vscale x 4 x i64> @intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv4i64_nxv4i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -2762,12 +2860,12 @@ entry:
<vscale x 4 x i64> %1,
i64 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i64> %a
}
-define <vscale x 8 x i64> @intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsadd_vi_nxv8i64_nxv8i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -2778,12 +2876,12 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i64> %a
}
-define <vscale x 8 x i64> @intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsadd_mask_vi_nxv8i64_nxv8i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu
@@ -2795,7 +2893,7 @@ entry:
<vscale x 8 x i64> %1,
i64 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv32.ll
deleted file mode 100644
index b5fa9a9..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv32.ll
+++ /dev/null
@@ -1,2849 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.nxv1i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- <vscale x 1 x i8> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.nxv2i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- <vscale x 2 x i8> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.nxv4i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- <vscale x 4 x i8> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.nxv8i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.nxv16i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- <vscale x 16 x i8> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.nxv32i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- <vscale x 32 x i8> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.nxv64i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- <vscale x 64 x i8> %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.nxv1i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.nxv2i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.nxv4i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.nxv8i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.nxv16i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.nxv32i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.nxv1i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- <vscale x 1 x i32> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.nxv2i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- <vscale x 2 x i32> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.nxv4i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- <vscale x 4 x i32> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.nxv8i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- <vscale x 8 x i32> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.nxv16i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- <vscale x 16 x i32> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- <vscale x 1 x i64> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.nxv2i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- <vscale x 2 x i64> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.nxv4i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- <vscale x 4 x i64> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vsaddu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.nxv8i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- <vscale x 8 x i64> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v9, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v8, v9
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v8, v10
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v8, v12
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v8, v16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT: vlse64.v v24, (a0), zero
-; CHECK-NEXT: vsaddu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 9,
- <vscale x 32 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 9,
- i32 %1)
-
- ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 9,
- <vscale x 64 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 9,
- i32 %1)
-
- ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 9,
- <vscale x 32 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 9,
- i32 %1)
-
- ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 9,
- <vscale x 16 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v9, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 9,
- <vscale x 1 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v10, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 9,
- <vscale x 2 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v12, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 9,
- <vscale x 4 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i32 %1) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vsaddu.vi v8, v8, 9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 9,
- i32 %1)
-
- ret <vscale x 8 x i64> %a
-}
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i32 %3) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu
-; CHECK-NEXT: vsaddu.vi v8, v16, 9, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 9,
- <vscale x 8 x i1> %2,
- i32 %3, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu.ll
index 077e45f..57a89d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
<vscale x 1 x i8> %1,
<vscale x 1 x i8> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
<vscale x 2 x i8> %1,
<vscale x 2 x i8> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
<vscale x 4 x i8> %1,
<vscale x 4 x i8> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
<vscale x 8 x i8> %1,
<vscale x 8 x i8> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
<vscale x 16 x i8> %1,
<vscale x 16 x i8> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
<vscale x 32 x i8> %1,
<vscale x 32 x i8> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
<vscale x 64 x i8> %1,
<vscale x 64 x i8> %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
<vscale x 1 x i16> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
<vscale x 2 x i16> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
<vscale x 4 x i16> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
<vscale x 8 x i16> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
<vscale x 16 x i16> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
<vscale x 32 x i16> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
<vscale x 1 x i32> %1,
<vscale x 1 x i32> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
<vscale x 2 x i32> %1,
<vscale x 2 x i32> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
<vscale x 4 x i32> %1,
<vscale x 4 x i32> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
<vscale x 8 x i32> %1,
<vscale x 8 x i32> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
<vscale x 16 x i32> %1,
<vscale x 16 x i32> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64);
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
<vscale x 1 x i64> %1,
<vscale x 1 x i64> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64);
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
<vscale x 2 x i64> %1,
<vscale x 2 x i64> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64);
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
<vscale x 4 x i64> %1,
<vscale x 4 x i64> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64);
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
<vscale x 8 x i64> %1,
<vscale x 8 x i64> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsaddu.nxv1i8.i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vsaddu.mask.nxv1i8.i8(
<vscale x 1 x i8>,
i8,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
<vscale x 1 x i8> %1,
i8 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.nxv2i8.i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vsaddu.mask.nxv2i8.i8(
<vscale x 2 x i8>,
i8,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
<vscale x 2 x i8> %1,
i8 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.nxv4i8.i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vsaddu.mask.nxv4i8.i8(
<vscale x 4 x i8>,
i8,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
<vscale x 4 x i8> %1,
i8 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.nxv8i8.i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vsaddu.mask.nxv8i8.i8(
<vscale x 8 x i8>,
i8,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
<vscale x 8 x i8> %1,
i8 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.nxv16i8.i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vsaddu.mask.nxv16i8.i8(
<vscale x 16 x i8>,
i8,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
<vscale x 16 x i8> %1,
i8 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.nxv32i8.i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vsaddu.mask.nxv32i8.i8(
<vscale x 32 x i8>,
i8,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
<vscale x 32 x i8> %1,
i8 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.nxv64i8.i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vsaddu.mask.nxv64i8.i8(
<vscale x 64 x i8>,
i8,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
<vscale x 64 x i8> %1,
i8 %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.nxv1i16.i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vsaddu.mask.nxv1i16.i16(
<vscale x 1 x i16>,
i16,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
<vscale x 1 x i16> %1,
i16 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.nxv2i16.i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vsaddu.mask.nxv2i16.i16(
<vscale x 2 x i16>,
i16,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
<vscale x 2 x i16> %1,
i16 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.nxv4i16.i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vsaddu.mask.nxv4i16.i16(
<vscale x 4 x i16>,
i16,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
<vscale x 4 x i16> %1,
i16 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.nxv8i16.i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vsaddu.mask.nxv8i16.i16(
<vscale x 8 x i16>,
i16,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
<vscale x 8 x i16> %1,
i16 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.nxv16i16.i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vsaddu.mask.nxv16i16.i16(
<vscale x 16 x i16>,
i16,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
<vscale x 16 x i16> %1,
i16 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.nxv32i16.i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vsaddu.mask.nxv32i16.i16(
<vscale x 32 x i16>,
i16,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
<vscale x 32 x i16> %1,
i16 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.nxv1i32.i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vsaddu.mask.nxv1i32.i32(
<vscale x 1 x i32>,
i32,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
<vscale x 1 x i32> %1,
i32 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.nxv2i32.i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vsaddu.mask.nxv2i32.i32(
<vscale x 2 x i32>,
i32,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
<vscale x 2 x i32> %1,
i32 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.nxv4i32.i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vsaddu.mask.nxv4i32.i32(
<vscale x 4 x i32>,
i32,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
<vscale x 4 x i32> %1,
i32 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.nxv8i32.i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vsaddu.mask.nxv8i32.i32(
<vscale x 8 x i32>,
i32,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
<vscale x 8 x i32> %1,
i32 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.nxv16i32.i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vsaddu.mask.nxv16i32.i32(
<vscale x 16 x i32>,
i32,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
<vscale x 16 x i32> %1,
i32 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -1890,20 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -1913,22 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
<vscale x 1 x i64>,
i64,
<vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v9, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vsaddu.vx v8, v9, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.i64(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
i64 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -1937,20 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.nxv2i64.i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -1960,22 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
<vscale x 2 x i64>,
i64,
<vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v10, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT: vsaddu.vx v8, v10, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vsaddu.mask.nxv2i64.i64(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
i64 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -1984,20 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.nxv4i64.i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -2007,22 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
<vscale x 4 x i64>,
i64,
<vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v12, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT: vsaddu.vx v8, v12, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vsaddu.mask.nxv4i64.i64(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
i64 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -2031,20 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vsaddu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.nxv8i64.i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -2054,27 +2140,39 @@ declare <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
<vscale x 8 x i64>,
i64,
<vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vsaddu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT: vlse64.v v24, (a0), zero
+; RV32-NEXT: vsaddu.vv v8, v16, v24, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsaddu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT: vsaddu.vx v8, v16, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vsaddu.mask.nxv8i64.i64(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
i64 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
-define <vscale x 1 x i8> @intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i64 %1) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -2085,12 +2183,12 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i8> %a
}
-define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
@@ -2102,12 +2200,12 @@ entry:
<vscale x 1 x i8> %1,
i8 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i8> %a
}
-define <vscale x 2 x i8> @intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i64 %1) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -2118,12 +2216,12 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i8> %a
}
-define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
@@ -2135,12 +2233,12 @@ entry:
<vscale x 2 x i8> %1,
i8 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i8> %a
}
-define <vscale x 4 x i8> @intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i64 %1) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -2151,12 +2249,12 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i8> %a
}
-define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
@@ -2168,12 +2266,12 @@ entry:
<vscale x 4 x i8> %1,
i8 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i8> %a
}
-define <vscale x 8 x i8> @intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i64 %1) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -2184,12 +2282,12 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i8> %a
}
-define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
@@ -2201,12 +2299,12 @@ entry:
<vscale x 8 x i8> %1,
i8 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i8> %a
}
-define <vscale x 16 x i8> @intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i64 %1) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -2217,12 +2315,12 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i8> %a
}
-define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
@@ -2234,12 +2332,12 @@ entry:
<vscale x 16 x i8> %1,
i8 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i8> %a
}
-define <vscale x 32 x i8> @intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i64 %1) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -2250,12 +2348,12 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 32 x i8> %a
}
-define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
@@ -2267,12 +2365,12 @@ entry:
<vscale x 32 x i8> %1,
i8 9,
<vscale x 32 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 32 x i8> %a
}
-define <vscale x 64 x i8> @intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i64 %1) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -2283,12 +2381,12 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 64 x i8> %a
}
-define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, i64 %3) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu
@@ -2300,12 +2398,12 @@ entry:
<vscale x 64 x i8> %1,
i8 9,
<vscale x 64 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 64 x i8> %a
}
-define <vscale x 1 x i16> @intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i64 %1) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -2316,12 +2414,12 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i16> %a
}
-define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -2333,12 +2431,12 @@ entry:
<vscale x 1 x i16> %1,
i16 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i16> %a
}
-define <vscale x 2 x i16> @intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i64 %1) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -2349,12 +2447,12 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i16> %a
}
-define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -2366,12 +2464,12 @@ entry:
<vscale x 2 x i16> %1,
i16 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i16> %a
}
-define <vscale x 4 x i16> @intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i64 %1) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -2382,12 +2480,12 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i16> %a
}
-define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -2399,12 +2497,12 @@ entry:
<vscale x 4 x i16> %1,
i16 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i16> %a
}
-define <vscale x 8 x i16> @intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i64 %1) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -2415,12 +2513,12 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i16> %a
}
-define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -2432,12 +2530,12 @@ entry:
<vscale x 8 x i16> %1,
i16 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i16> %a
}
-define <vscale x 16 x i16> @intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i64 %1) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -2448,12 +2546,12 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i16> %a
}
-define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -2465,12 +2563,12 @@ entry:
<vscale x 16 x i16> %1,
i16 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i16> %a
}
-define <vscale x 32 x i16> @intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i64 %1) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -2481,12 +2579,12 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 32 x i16> %a
}
-define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, i64 %3) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, mu
@@ -2498,12 +2596,12 @@ entry:
<vscale x 32 x i16> %1,
i16 9,
<vscale x 32 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 32 x i16> %a
}
-define <vscale x 1 x i32> @intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i64 %1) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -2514,12 +2612,12 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i32> %a
}
-define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -2531,12 +2629,12 @@ entry:
<vscale x 1 x i32> %1,
i32 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i64 %1) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -2547,12 +2645,12 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i32> %a
}
-define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
@@ -2564,12 +2662,12 @@ entry:
<vscale x 2 x i32> %1,
i32 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i64 %1) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -2580,12 +2678,12 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i32> %a
}
-define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -2597,12 +2695,12 @@ entry:
<vscale x 4 x i32> %1,
i32 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i64 %1) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -2613,12 +2711,12 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i32> %a
}
-define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -2630,12 +2728,12 @@ entry:
<vscale x 8 x i32> %1,
i32 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i64 %1) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -2646,12 +2744,12 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 16 x i32> %a
}
-define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, i64 %3) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu
@@ -2663,12 +2761,12 @@ entry:
<vscale x 16 x i32> %1,
i32 9,
<vscale x 16 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 16 x i32> %a
}
-define <vscale x 1 x i64> @intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv1i64_nxv1i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -2679,12 +2777,12 @@ entry:
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 1 x i64> %a
}
-define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, i64 %3) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv1i64_nxv1i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
@@ -2696,12 +2794,12 @@ entry:
<vscale x 1 x i64> %1,
i64 9,
<vscale x 1 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 1 x i64> %a
}
-define <vscale x 2 x i64> @intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv2i64_nxv2i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -2712,12 +2810,12 @@ entry:
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 2 x i64> %a
}
-define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, i64 %3) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv2i64_nxv2i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
@@ -2729,12 +2827,12 @@ entry:
<vscale x 2 x i64> %1,
i64 9,
<vscale x 2 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 2 x i64> %a
}
-define <vscale x 4 x i64> @intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv4i64_nxv4i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -2745,12 +2843,12 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 4 x i64> %a
}
-define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, i64 %3) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv4i64_nxv4i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -2762,12 +2860,12 @@ entry:
<vscale x 4 x i64> %1,
i64 9,
<vscale x 4 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 4 x i64> %a
}
-define <vscale x 8 x i64> @intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, iXLen %1) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_vi_nxv8i64_nxv8i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -2778,12 +2876,12 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 9,
- i64 %1)
+ iXLen %1)
ret <vscale x 8 x i64> %a
}
-define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, i64 %3) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i1> %2, iXLen %3) nounwind {
; CHECK-LABEL: intrinsic_vsaddu_mask_vi_nxv8i64_nxv8i64_i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, mu
@@ -2795,7 +2893,7 @@ entry:
<vscale x 8 x i64> %1,
i64 9,
<vscale x 8 x i1> %2,
- i64 %3, i64 1)
+ iXLen %3, iXLen 1)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll
index 4115e6a..fd90e67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1down-constant-vl-rv32.ll
@@ -51,7 +51,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2(<vsc
;
; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
; CHECK-64-NEXT: ret
@@ -84,7 +84,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl3(<vsc
;
; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl3:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
; CHECK-64-NEXT: ret
@@ -117,7 +117,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl8(<vsc
;
; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl8:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
; CHECK-64-NEXT: ret
@@ -152,7 +152,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl9(<vsc
;
; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl9:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
; CHECK-64-NEXT: ret
@@ -187,7 +187,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl15(<vs
;
; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl15:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
; CHECK-64-NEXT: ret
@@ -213,14 +213,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl16(<vs
;
; CHECK-512-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl16:
; CHECK-512: # %bb.0: # %entry
-; CHECK-512-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT: vsetivli zero, 16, e32, m1, ta, ma
; CHECK-512-NEXT: vslide1down.vx v8, v8, a0
; CHECK-512-NEXT: vslide1down.vx v8, v8, a1
; CHECK-512-NEXT: ret
;
; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl16:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
; CHECK-64-NEXT: ret
@@ -247,14 +247,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2047(<
;
; CHECK-512-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2047:
; CHECK-512: # %bb.0: # %entry
-; CHECK-512-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT: vsetivli zero, 16, e32, m1, ta, ma
; CHECK-512-NEXT: vslide1down.vx v8, v8, a0
; CHECK-512-NEXT: vslide1down.vx v8, v8, a1
; CHECK-512-NEXT: ret
;
; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2047:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
; CHECK-64-NEXT: ret
@@ -269,12 +269,26 @@ entry:
}
define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: vslide1down.vx v8, v8, a1
-; CHECK-NEXT: ret
+; CHECK-128-65536-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-128-65536: # %bb.0: # %entry
+; CHECK-128-65536-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-128-65536-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-128-65536-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-128-65536-NEXT: ret
+;
+; CHECK-512-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-512: # %bb.0: # %entry
+; CHECK-512-NEXT: vsetivli zero, 16, e32, m1, ta, ma
+; CHECK-512-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-512-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-512-NEXT: ret
+;
+; CHECK-64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-64: # %bb.0: # %entry
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-64-NEXT: vslide1down.vx v8, v8, a0
+; CHECK-64-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
<vscale x 1 x i64> undef,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv32.ll
deleted file mode 100644
index 0699737..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv32.ll
+++ /dev/null
@@ -1,1069 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+f -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1down.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vslide1down.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1down.mask.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vslide1down.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1down.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vslide1down.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1down.mask.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vslide1down.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1down.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vslide1down.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1down.mask.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vslide1down.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1down.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vslide1down.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1down.mask.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vslide1down.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1down.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vslide1down.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1down.mask.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vslide1down.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1down.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vslide1down.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1down.mask.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vslide1down.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1down.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vslide1down.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1down.mask.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vslide1down.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1down.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vslide1down.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1down.mask.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vslide1down.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1down.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vslide1down.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1down.mask.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vslide1down.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1down.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vslide1down.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1down.mask.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vslide1down.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1down.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vslide1down.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1down.mask.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vslide1down.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1down.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vslide1down.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1down.mask.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vslide1down.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1down.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vslide1down.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1down.mask.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vslide1down.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1down.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vslide1down.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1down.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vslide1down.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1down.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vslide1down.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1down.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vslide1down.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1down.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vslide1down.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1down.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vslide1down.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1down.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vslide1down.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1down.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vslide1down.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1down.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vslide1down.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1down.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vslide1down.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m1, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: vslide1down.vx v8, v8, a1
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m1, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v9, v9, a0
-; CHECK-NEXT: vslide1down.vx v9, v9, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m2, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: vslide1down.vx v8, v8, a1
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m2, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
-; CHECK-NEXT: vslide1down.vx v10, v10, a0
-; CHECK-NEXT: vslide1down.vx v10, v10, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m4, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: vslide1down.vx v8, v8, a1
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m4, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT: vslide1down.vx v12, v12, a0
-; CHECK-NEXT: vslide1down.vx v12, v12, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m8, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: vslide1down.vx v8, v8, a1
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m8, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vslide1down.vx v16, v16, a0
-; CHECK-NEXT: vslide1down.vx v16, v16, a1
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1down.ll
index ccb107b..e7ca276 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1down-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1down.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.riscv.vslide1down.nxv1i8.i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vslide1down.mask.nxv1i8.i8(
<vscale x 1 x i8>,
i8,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
<vscale x 1 x i8> %1,
i8 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1down.nxv2i8.i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1down.mask.nxv2i8.i8(
<vscale x 2 x i8>,
i8,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
<vscale x 2 x i8> %1,
i8 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1down.nxv4i8.i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1down.mask.nxv4i8.i8(
<vscale x 4 x i8>,
i8,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
<vscale x 4 x i8> %1,
i8 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1down.nxv8i8.i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1down.mask.nxv8i8.i8(
<vscale x 8 x i8>,
i8,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
<vscale x 8 x i8> %1,
i8 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1down.nxv16i8.i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1down.mask.nxv16i8.i8(
<vscale x 16 x i8>,
i8,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
<vscale x 16 x i8> %1,
i8 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1down.nxv32i8.i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1down.mask.nxv32i8.i8(
<vscale x 32 x i8>,
i8,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
<vscale x 32 x i8> %1,
i8 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1down.nxv64i8.i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1down.mask.nxv64i8.i8(
<vscale x 64 x i8>,
i8,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
@@ -326,7 +328,7 @@ entry:
<vscale x 64 x i8> %1,
i8 %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -335,9 +337,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1down.nxv1i16.i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
@@ -348,7 +350,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -358,10 +360,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1down.mask.nxv1i16.i16(
<vscale x 1 x i16>,
i16,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
@@ -373,7 +375,7 @@ entry:
<vscale x 1 x i16> %1,
i16 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -382,9 +384,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1down.nxv2i16.i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
@@ -395,7 +397,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -405,10 +407,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1down.mask.nxv2i16.i16(
<vscale x 2 x i16>,
i16,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
@@ -420,7 +422,7 @@ entry:
<vscale x 2 x i16> %1,
i16 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -429,9 +431,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1down.nxv4i16.i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
@@ -442,7 +444,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -452,10 +454,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1down.mask.nxv4i16.i16(
<vscale x 4 x i16>,
i16,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
@@ -467,7 +469,7 @@ entry:
<vscale x 4 x i16> %1,
i16 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -476,9 +478,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1down.nxv8i16.i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
@@ -489,7 +491,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -499,10 +501,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1down.mask.nxv8i16.i16(
<vscale x 8 x i16>,
i16,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
@@ -514,7 +516,7 @@ entry:
<vscale x 8 x i16> %1,
i16 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -523,9 +525,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1down.nxv16i16.i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
@@ -536,7 +538,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -546,10 +548,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1down.mask.nxv16i16.i16(
<vscale x 16 x i16>,
i16,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
@@ -561,7 +563,7 @@ entry:
<vscale x 16 x i16> %1,
i16 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -570,9 +572,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1down.nxv32i16.i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
@@ -583,7 +585,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -593,10 +595,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1down.mask.nxv32i16.i16(
<vscale x 32 x i16>,
i16,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
@@ -608,7 +610,7 @@ entry:
<vscale x 32 x i16> %1,
i16 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -617,9 +619,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1down.nxv1i32.i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
@@ -630,7 +632,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -640,10 +642,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1down.mask.nxv1i32.i32(
<vscale x 1 x i32>,
i32,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
@@ -655,7 +657,7 @@ entry:
<vscale x 1 x i32> %1,
i32 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -664,9 +666,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1down.nxv2i32.i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
@@ -677,7 +679,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -687,10 +689,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1down.mask.nxv2i32.i32(
<vscale x 2 x i32>,
i32,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
@@ -702,7 +704,7 @@ entry:
<vscale x 2 x i32> %1,
i32 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -711,9 +713,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1down.nxv4i32.i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
@@ -724,7 +726,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -734,10 +736,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1down.mask.nxv4i32.i32(
<vscale x 4 x i32>,
i32,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
@@ -749,7 +751,7 @@ entry:
<vscale x 4 x i32> %1,
i32 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -758,9 +760,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1down.nxv8i32.i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
@@ -771,7 +773,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -781,10 +783,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1down.mask.nxv8i32.i32(
<vscale x 8 x i32>,
i32,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
@@ -796,7 +798,7 @@ entry:
<vscale x 8 x i32> %1,
i32 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -805,9 +807,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1down.nxv16i32.i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -818,7 +820,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -828,10 +830,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1down.mask.nxv16i32.i32(
<vscale x 16 x i32>,
i32,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
@@ -843,7 +845,7 @@ entry:
<vscale x 16 x i32> %1,
i32 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -852,20 +854,29 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m1, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.nxv1i64.i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -875,22 +886,33 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
<vscale x 1 x i64>,
i64,
<vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m1, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m1, ta, ma
+; RV32-NEXT: vslide1down.vx v9, v9, a0
+; RV32-NEXT: vslide1down.vx v9, v9, a1
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v9, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vslide1down.vx v8, v9, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vslide1down.mask.nxv1i64.i64(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
i64 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -899,20 +921,29 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m2, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.nxv2i64.i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -922,22 +953,33 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
<vscale x 2 x i64>,
i64,
<vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m2, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m2, ta, ma
+; RV32-NEXT: vslide1down.vx v10, v10, a0
+; RV32-NEXT: vslide1down.vx v10, v10, a1
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v10, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT: vslide1down.vx v8, v10, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vslide1down.mask.nxv2i64.i64(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
i64 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -946,20 +988,29 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m4, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.nxv4i64.i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -969,22 +1020,33 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
<vscale x 4 x i64>,
i64,
<vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m4, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, ma
+; RV32-NEXT: vslide1down.vx v12, v12, a0
+; RV32-NEXT: vslide1down.vx v12, v12, a1
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v12, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT: vslide1down.vx v8, v12, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vslide1down.mask.nxv4i64.i64(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
i64 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -993,20 +1055,29 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vslide1down.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m8, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.nxv8i64.i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -1016,22 +1087,33 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
<vscale x 8 x i64>,
i64,
<vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vslide1down.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m8, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vslide1down.vx v16, v16, a0
+; RV32-NEXT: vslide1down.vx v16, v16, a1
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1down_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT: vslide1down.vx v8, v16, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vslide1down.mask.nxv8i64.i64(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
i64 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll
index f0d621b..b26f1ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1up-constant-vl-rv32.ll
@@ -51,7 +51,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2(<vscal
;
; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
; CHECK-64-NEXT: ret
@@ -84,7 +84,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl3(<vscal
;
; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl3:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
; CHECK-64-NEXT: ret
@@ -117,7 +117,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl8(<vscal
;
; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl8:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
; CHECK-64-NEXT: ret
@@ -152,7 +152,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl9(<vscal
;
; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl9:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
; CHECK-64-NEXT: ret
@@ -187,7 +187,7 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl15(<vsca
;
; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl15:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
; CHECK-64-NEXT: ret
@@ -213,14 +213,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl16(<vsca
;
; CHECK-512-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl16:
; CHECK-512: # %bb.0: # %entry
-; CHECK-512-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT: vsetivli zero, 16, e32, m1, ta, ma
; CHECK-512-NEXT: vslide1up.vx v9, v8, a1
; CHECK-512-NEXT: vslide1up.vx v8, v9, a0
; CHECK-512-NEXT: ret
;
; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl16:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
; CHECK-64-NEXT: ret
@@ -247,14 +247,14 @@ define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2047(<vs
;
; CHECK-512-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2047:
; CHECK-512: # %bb.0: # %entry
-; CHECK-512-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-512-NEXT: vsetivli zero, 16, e32, m1, ta, ma
; CHECK-512-NEXT: vslide1up.vx v9, v8, a1
; CHECK-512-NEXT: vslide1up.vx v8, v9, a0
; CHECK-512-NEXT: ret
;
; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2047:
; CHECK-64: # %bb.0: # %entry
-; CHECK-64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
; CHECK-64-NEXT: ret
@@ -269,12 +269,26 @@ entry:
}
define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048(<vscale x 1 x i64> %0, i64 %1) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a1
-; CHECK-NEXT: vslide1up.vx v8, v9, a0
-; CHECK-NEXT: ret
+; CHECK-128-65536-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-128-65536: # %bb.0: # %entry
+; CHECK-128-65536-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-128-65536-NEXT: vslide1up.vx v9, v8, a1
+; CHECK-128-65536-NEXT: vslide1up.vx v8, v9, a0
+; CHECK-128-65536-NEXT: ret
+;
+; CHECK-512-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-512: # %bb.0: # %entry
+; CHECK-512-NEXT: vsetivli zero, 16, e32, m1, ta, ma
+; CHECK-512-NEXT: vslide1up.vx v9, v8, a1
+; CHECK-512-NEXT: vslide1up.vx v8, v9, a0
+; CHECK-512-NEXT: ret
+;
+; CHECK-64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64_vl2048:
+; CHECK-64: # %bb.0: # %entry
+; CHECK-64-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-64-NEXT: vslide1up.vx v9, v8, a1
+; CHECK-64-NEXT: vslide1up.vx v8, v9, a0
+; CHECK-64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
<vscale x 1 x i64> undef,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv64.ll
deleted file mode 100644
index 6c82149..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv64.ll
+++ /dev/null
@@ -1,1059 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1up.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- i64);
-
-define <vscale x 1 x i8> @intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vslide1up.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vslide1up.mask.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i8> @intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vslide1up.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1up.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- i64);
-
-define <vscale x 2 x i8> @intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vslide1up.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vslide1up.mask.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i8> @intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vslide1up.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1up.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- i64);
-
-define <vscale x 4 x i8> @intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vslide1up.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vslide1up.mask.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i8> @intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vslide1up.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1up.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- i64);
-
-define <vscale x 8 x i8> @intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vslide1up.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vslide1up.mask.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i8> @intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vslide1up.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1up.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- i64);
-
-define <vscale x 16 x i8> @intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vslide1up.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vslide1up.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vslide1up.mask.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i8> @intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vslide1up.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1up.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- i64);
-
-define <vscale x 32 x i8> @intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vslide1up.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vslide1up.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vslide1up.mask.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- <vscale x 32 x i1>,
- i64,
- i64);
-
-define <vscale x 32 x i8> @intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vslide1up.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 %2,
- <vscale x 32 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1up.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- i64);
-
-define <vscale x 64 x i8> @intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vslide1up.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vslide1up.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vslide1up.mask.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- <vscale x 64 x i1>,
- i64,
- i64);
-
-define <vscale x 64 x i8> @intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vslide1up.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 %2,
- <vscale x 64 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1up.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- i64);
-
-define <vscale x 1 x i16> @intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vslide1up.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vslide1up.mask.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i16> @intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vslide1up.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1up.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- i64);
-
-define <vscale x 2 x i16> @intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vslide1up.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vslide1up.mask.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i16> @intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vslide1up.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1up.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- i64);
-
-define <vscale x 4 x i16> @intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vslide1up.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vslide1up.mask.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i16> @intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vslide1up.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1up.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- i64);
-
-define <vscale x 8 x i16> @intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vslide1up.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vslide1up.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vslide1up.mask.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i16> @intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vslide1up.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1up.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- i64);
-
-define <vscale x 16 x i16> @intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vslide1up.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vslide1up.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vslide1up.mask.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i16> @intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vslide1up.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1up.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- i64);
-
-define <vscale x 32 x i16> @intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vslide1up.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vslide1up.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vslide1up.mask.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- <vscale x 32 x i1>,
- i64,
- i64);
-
-define <vscale x 32 x i16> @intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vslide1up.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 %2,
- <vscale x 32 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1up.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- i64);
-
-define <vscale x 1 x i32> @intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv1r.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vslide1up.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vslide1up.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i32> @intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vslide1up.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1up.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- i64);
-
-define <vscale x 2 x i32> @intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vslide1up.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vslide1up.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i32> @intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vslide1up.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- i64);
-
-define <vscale x 4 x i32> @intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vslide1up.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vslide1up.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i32> @intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vslide1up.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1up.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- i64);
-
-define <vscale x 8 x i32> @intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vslide1up.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vslide1up.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vslide1up.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i32> @intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vslide1up.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1up.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- i64);
-
-define <vscale x 16 x i32> @intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vslide1up.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vslide1up.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vslide1up.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i32> @intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vslide1up.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vslide1up.vx v10, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vslide1up.vx v12, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vslide1up.vx v16, v8, a0
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vslide1up.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vslide1up.ll
index 55f0196..e1f020a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslide1up-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslide1up.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+f -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.riscv.vslide1up.nxv1i8.i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
@@ -20,7 +22,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -30,10 +32,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vslide1up.mask.nxv1i8.i8(
<vscale x 1 x i8>,
i8,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
@@ -45,7 +47,7 @@ entry:
<vscale x 1 x i8> %1,
i8 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -54,9 +56,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1up.nxv2i8.i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
@@ -68,7 +70,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -78,10 +80,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vslide1up.mask.nxv2i8.i8(
<vscale x 2 x i8>,
i8,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
@@ -93,7 +95,7 @@ entry:
<vscale x 2 x i8> %1,
i8 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -102,9 +104,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1up.nxv4i8.i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
@@ -116,7 +118,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -126,10 +128,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vslide1up.mask.nxv4i8.i8(
<vscale x 4 x i8>,
i8,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
@@ -141,7 +143,7 @@ entry:
<vscale x 4 x i8> %1,
i8 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -150,9 +152,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1up.nxv8i8.i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
@@ -164,7 +166,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -174,10 +176,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vslide1up.mask.nxv8i8.i8(
<vscale x 8 x i8>,
i8,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
@@ -189,7 +191,7 @@ entry:
<vscale x 8 x i8> %1,
i8 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -198,9 +200,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1up.nxv16i8.i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
@@ -212,7 +214,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -222,10 +224,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vslide1up.mask.nxv16i8.i8(
<vscale x 16 x i8>,
i8,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
@@ -237,7 +239,7 @@ entry:
<vscale x 16 x i8> %1,
i8 %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -246,9 +248,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1up.nxv32i8.i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
@@ -260,7 +262,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -270,10 +272,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vslide1up.mask.nxv32i8.i8(
<vscale x 32 x i8>,
i8,
<vscale x 32 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
@@ -285,7 +287,7 @@ entry:
<vscale x 32 x i8> %1,
i8 %2,
<vscale x 32 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -294,9 +296,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1up.nxv64i8.i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
@@ -308,7 +310,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -318,10 +320,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vslide1up.mask.nxv64i8.i8(
<vscale x 64 x i8>,
i8,
<vscale x 64 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
@@ -333,7 +335,7 @@ entry:
<vscale x 64 x i8> %1,
i8 %2,
<vscale x 64 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -342,9 +344,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1up.nxv1i16.i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
@@ -356,7 +358,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -366,10 +368,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vslide1up.mask.nxv1i16.i16(
<vscale x 1 x i16>,
i16,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
@@ -381,7 +383,7 @@ entry:
<vscale x 1 x i16> %1,
i16 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -390,9 +392,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1up.nxv2i16.i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
@@ -404,7 +406,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -414,10 +416,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vslide1up.mask.nxv2i16.i16(
<vscale x 2 x i16>,
i16,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
@@ -429,7 +431,7 @@ entry:
<vscale x 2 x i16> %1,
i16 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -438,9 +440,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1up.nxv4i16.i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
@@ -452,7 +454,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -462,10 +464,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vslide1up.mask.nxv4i16.i16(
<vscale x 4 x i16>,
i16,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
@@ -477,7 +479,7 @@ entry:
<vscale x 4 x i16> %1,
i16 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -486,9 +488,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1up.nxv8i16.i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
@@ -500,7 +502,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -510,10 +512,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vslide1up.mask.nxv8i16.i16(
<vscale x 8 x i16>,
i16,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
@@ -525,7 +527,7 @@ entry:
<vscale x 8 x i16> %1,
i16 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -534,9 +536,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1up.nxv16i16.i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
@@ -548,7 +550,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -558,10 +560,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vslide1up.mask.nxv16i16.i16(
<vscale x 16 x i16>,
i16,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
@@ -573,7 +575,7 @@ entry:
<vscale x 16 x i16> %1,
i16 %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -582,9 +584,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1up.nxv32i16.i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
@@ -596,7 +598,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -606,10 +608,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vslide1up.mask.nxv32i16.i16(
<vscale x 32 x i16>,
i16,
<vscale x 32 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
@@ -621,7 +623,7 @@ entry:
<vscale x 32 x i16> %1,
i16 %2,
<vscale x 32 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -630,9 +632,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1up.nxv1i32.i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
@@ -644,7 +646,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -654,10 +656,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vslide1up.mask.nxv1i32.i32(
<vscale x 1 x i32>,
i32,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
@@ -669,7 +671,7 @@ entry:
<vscale x 1 x i32> %1,
i32 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -678,9 +680,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1up.nxv2i32.i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
@@ -692,7 +694,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -702,10 +704,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vslide1up.mask.nxv2i32.i32(
<vscale x 2 x i32>,
i32,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
@@ -717,7 +719,7 @@ entry:
<vscale x 2 x i32> %1,
i32 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -726,9 +728,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
@@ -740,7 +742,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -750,10 +752,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vslide1up.mask.nxv4i32.i32(
<vscale x 4 x i32>,
i32,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
@@ -765,7 +767,7 @@ entry:
<vscale x 4 x i32> %1,
i32 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -774,9 +776,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1up.nxv8i32.i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
@@ -788,7 +790,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -798,10 +800,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vslide1up.mask.nxv8i32.i32(
<vscale x 8 x i32>,
i32,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
@@ -813,7 +815,7 @@ entry:
<vscale x 8 x i32> %1,
i32 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -822,9 +824,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1up.nxv16i32.i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -836,7 +838,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -846,10 +848,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vslide1up.mask.nxv16i32.i32(
<vscale x 16 x i32>,
i32,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
@@ -861,7 +863,7 @@ entry:
<vscale x 16 x i32> %1,
i32 %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -870,23 +872,30 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
i64,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m1, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
-; CHECK-NEXT: vslide1up.vx v9, v8, a1
-; CHECK-NEXT: vslide1up.vx v8, v9, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m1, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT: vslide1up.vx v9, v8, a1
+; RV32-NEXT: vslide1up.vx v8, v9, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vslide1up.vx v9, v8, a0
+; RV64-NEXT: vmv.v.v v8, v9
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.nxv1i64.i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -896,27 +905,33 @@ declare <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
<vscale x 1 x i64>,
i64,
<vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m1, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m1, ta, ma
-; CHECK-NEXT: vslide1up.vx v10, v9, a1
-; CHECK-NEXT: vslide1up.vx v9, v10, a0
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m1, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m1, ta, ma
+; RV32-NEXT: vslide1up.vx v10, v9, a1
+; RV32-NEXT: vslide1up.vx v9, v10, a0
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v9, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vslide1up.vx v8, v9, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vslide1up.mask.nxv1i64.i64(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
i64 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -925,23 +940,30 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
i64,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m2, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m2, ta, ma
-; CHECK-NEXT: vslide1up.vx v10, v8, a1
-; CHECK-NEXT: vslide1up.vx v8, v10, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m2, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; RV32-NEXT: vslide1up.vx v10, v8, a1
+; RV32-NEXT: vslide1up.vx v8, v10, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vslide1up.vx v10, v8, a0
+; RV64-NEXT: vmv.v.v v8, v10
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.nxv2i64.i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -951,27 +973,33 @@ declare <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
<vscale x 2 x i64>,
i64,
<vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m2, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
-; CHECK-NEXT: vslide1up.vx v12, v10, a1
-; CHECK-NEXT: vslide1up.vx v10, v12, a0
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m2, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m2, ta, ma
+; RV32-NEXT: vslide1up.vx v12, v10, a1
+; RV32-NEXT: vslide1up.vx v10, v12, a0
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v10, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT: vslide1up.vx v8, v10, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vslide1up.mask.nxv2i64.i64(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
i64 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -980,23 +1008,30 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
i64,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m4, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT: vslide1up.vx v12, v8, a1
-; CHECK-NEXT: vslide1up.vx v8, v12, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m4, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma
+; RV32-NEXT: vslide1up.vx v12, v8, a1
+; RV32-NEXT: vslide1up.vx v8, v12, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vslide1up.vx v12, v8, a0
+; RV64-NEXT: vmv.v.v v8, v12
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.nxv4i64.i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -1006,27 +1041,33 @@ declare <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
<vscale x 4 x i64>,
i64,
<vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m4, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT: vslide1up.vx v16, v12, a1
-; CHECK-NEXT: vslide1up.vx v12, v16, a0
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m4, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m4, ta, ma
+; RV32-NEXT: vslide1up.vx v16, v12, a1
+; RV32-NEXT: vslide1up.vx v12, v16, a0
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v12, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT: vslide1up.vx v8, v12, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vslide1up.mask.nxv4i64.i64(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
i64 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -1035,23 +1076,30 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
i64,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a2, a2, e64, m8, ta, ma
-; CHECK-NEXT: slli a2, a2, 1
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vslide1up.vx v16, v8, a1
-; CHECK-NEXT: vslide1up.vx v8, v16, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a2, a2, e64, m8, ta, ma
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; RV32-NEXT: vslide1up.vx v16, v8, a1
+; RV32-NEXT: vslide1up.vx v8, v16, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vslide1up.vx v16, v8, a0
+; RV64-NEXT: vmv.v.v v8, v16
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.nxv8i64.i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -1061,27 +1109,33 @@ declare <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
<vscale x 8 x i64>,
i64,
<vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli a3, a2, e64, m8, ta, ma
-; CHECK-NEXT: slli a3, a3, 1
-; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT: vslide1up.vx v24, v16, a1
-; CHECK-NEXT: vslide1up.vx v16, v24, a0
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetvli a3, a2, e64, m8, ta, ma
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT: vslide1up.vx v24, v16, a1
+; RV32-NEXT: vslide1up.vx v16, v24, a0
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vslide1up_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT: vslide1up.vx v8, v16, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vslide1up.mask.nxv8i64.i64(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
i64 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll
deleted file mode 100644
index e7d8ae63..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv32.ll
+++ /dev/null
@@ -1,2166 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-; RUN: not --crash llc -mtriple=riscv32 -mattr=+zve64d 2>&1 \
-; RUN: < %s | FileCheck %s --check-prefixes=ZVE64D
-
-; ZVE64D: LLVM ERROR: Cannot select: intrinsic %llvm.riscv.vsmul
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- <vscale x 1 x i8> %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- <vscale x 2 x i8> %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- <vscale x 4 x i8> %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i32 0, i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i1>,
- i32, i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- <vscale x 16 x i8> %2,
- <vscale x 16 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i32 0, i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i1>,
- i32, i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- <vscale x 32 x i8> %2,
- <vscale x 32 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i32 0, i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i1>,
- i32, i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- <vscale x 64 x i8> %2,
- <vscale x 64 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 0, i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32, i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 0, i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32, i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- <vscale x 1 x i32> %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- <vscale x 2 x i32> %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- <vscale x 4 x i32> %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- <vscale x 8 x i32> %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 0, i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i1>,
- i32, i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- <vscale x 16 x i32> %2,
- <vscale x 16 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- <vscale x 1 x i64> %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- <vscale x 2 x i64> %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- <vscale x 4 x i64> %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vsmul.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- <vscale x 8 x i64> %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 %1,
- i32 0, i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- <vscale x 16 x i1>,
- i32, i32, i32);
-
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 %2,
- <vscale x 16 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 %1,
- i32 0, i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- <vscale x 32 x i1>,
- i32, i32, i32);
-
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 %2,
- <vscale x 32 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 %1,
- i32 0, i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- <vscale x 64 x i1>,
- i32, i32, i32);
-
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 %2,
- <vscale x 64 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 %1,
- i32 0, i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- <vscale x 16 x i1>,
- i32, i32, i32);
-
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 %2,
- <vscale x 16 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 %1,
- i32 0, i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- <vscale x 32 x i1>,
- i32, i32, i32);
-
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 %2,
- <vscale x 32 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32, i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32, i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32, i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32, i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32, i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i32 0, i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i32, i32, i32);
-
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v9, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v8, v9
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1,
- i32 0, i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- <vscale x 1 x i1>,
- i32, i32, i32);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v9, v10, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2,
- <vscale x 1 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v8, v10
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1,
- i32 0, i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- <vscale x 2 x i1>,
- i32, i32, i32);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v10, v12, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2,
- <vscale x 2 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v8, v12
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1,
- i32 0, i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- <vscale x 4 x i1>,
- i32, i32, i32);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v12, v16, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2,
- <vscale x 4 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v8, v16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1,
- i32 0, i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- <vscale x 8 x i1>,
- i32, i32, i32);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT: vlse64.v v24, (a0), zero
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsmul.vv v8, v16, v24, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2,
- <vscale x 8 x i1> %3,
- i32 0, i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vsmul.ll
index 66bc5c9..bc53bce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsmul-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsmul.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-; RUN: not --crash llc -mtriple=riscv64 -mattr=+zve64d 2>&1 \
-; RUN: < %s | FileCheck %s --check-prefixes=ZVE64D
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: sed 's/iXLen/i64/g' %s | not --crash llc -mtriple=riscv64 \
+; RUN: -mattr=+zve64d 2>&1 | FileCheck %s --check-prefixes=ZVE64D
; ZVE64D: LLVM ERROR: Cannot select: intrinsic %llvm.riscv.vsmul
@@ -10,9 +12,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -24,7 +26,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -34,9 +36,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -49,7 +51,7 @@ entry:
<vscale x 1 x i8> %1,
<vscale x 1 x i8> %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -58,9 +60,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -72,7 +74,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -82,9 +84,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -97,7 +99,7 @@ entry:
<vscale x 2 x i8> %1,
<vscale x 2 x i8> %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -106,9 +108,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -120,7 +122,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -130,9 +132,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -145,7 +147,7 @@ entry:
<vscale x 4 x i8> %1,
<vscale x 4 x i8> %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -154,9 +156,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -168,7 +170,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -178,9 +180,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -193,7 +195,7 @@ entry:
<vscale x 8 x i8> %1,
<vscale x 8 x i8> %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -202,9 +204,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -216,7 +218,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -226,9 +228,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -241,7 +243,7 @@ entry:
<vscale x 16 x i8> %1,
<vscale x 16 x i8> %2,
<vscale x 16 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -250,9 +252,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -264,7 +266,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -274,9 +276,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -289,7 +291,7 @@ entry:
<vscale x 32 x i8> %1,
<vscale x 32 x i8> %2,
<vscale x 32 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -298,9 +300,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -312,7 +314,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -322,9 +324,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8r.v v24, (a0)
@@ -338,7 +340,7 @@ entry:
<vscale x 64 x i8> %1,
<vscale x 64 x i8> %2,
<vscale x 64 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -347,9 +349,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -361,7 +363,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -371,9 +373,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -386,7 +388,7 @@ entry:
<vscale x 1 x i16> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -395,9 +397,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -409,7 +411,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -419,9 +421,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -434,7 +436,7 @@ entry:
<vscale x 2 x i16> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -443,9 +445,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -457,7 +459,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -467,9 +469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -482,7 +484,7 @@ entry:
<vscale x 4 x i16> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -491,9 +493,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -505,7 +507,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -515,9 +517,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -530,7 +532,7 @@ entry:
<vscale x 8 x i16> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -539,9 +541,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -553,7 +555,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -563,9 +565,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -578,7 +580,7 @@ entry:
<vscale x 16 x i16> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -587,9 +589,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -601,7 +603,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -611,9 +613,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -627,7 +629,7 @@ entry:
<vscale x 32 x i16> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -636,9 +638,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -650,7 +652,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -660,9 +662,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -675,7 +677,7 @@ entry:
<vscale x 1 x i32> %1,
<vscale x 1 x i32> %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -684,9 +686,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -698,7 +700,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -708,9 +710,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -723,7 +725,7 @@ entry:
<vscale x 2 x i32> %1,
<vscale x 2 x i32> %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -732,9 +734,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -746,7 +748,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -756,9 +758,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -771,7 +773,7 @@ entry:
<vscale x 4 x i32> %1,
<vscale x 4 x i32> %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -780,9 +782,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -794,7 +796,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -804,9 +806,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -819,7 +821,7 @@ entry:
<vscale x 8 x i32> %1,
<vscale x 8 x i32> %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -828,9 +830,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -842,7 +844,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -852,9 +854,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
@@ -868,7 +870,7 @@ entry:
<vscale x 16 x i32> %1,
<vscale x 16 x i32> %2,
<vscale x 16 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -877,9 +879,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -891,7 +893,7 @@ entry:
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -901,9 +903,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -916,7 +918,7 @@ entry:
<vscale x 1 x i64> %1,
<vscale x 1 x i64> %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -925,9 +927,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -939,7 +941,7 @@ entry:
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -949,9 +951,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -964,7 +966,7 @@ entry:
<vscale x 2 x i64> %1,
<vscale x 2 x i64> %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -973,9 +975,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -987,7 +989,7 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -997,9 +999,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1012,7 +1014,7 @@ entry:
<vscale x 4 x i64> %1,
<vscale x 4 x i64> %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -1021,9 +1023,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1035,7 +1037,7 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -1045,9 +1047,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
@@ -1061,7 +1063,7 @@ entry:
<vscale x 8 x i64> %1,
<vscale x 8 x i64> %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
@@ -1070,9 +1072,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.nxv1i8.i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
i8,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1084,7 +1086,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -1094,9 +1096,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vsmul.mask.nxv1i8.i8(
<vscale x 1 x i8>,
i8,
<vscale x 1 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1109,7 +1111,7 @@ entry:
<vscale x 1 x i8> %1,
i8 %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -1118,9 +1120,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.nxv2i8.i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
i8,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1132,7 +1134,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -1142,9 +1144,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vsmul.mask.nxv2i8.i8(
<vscale x 2 x i8>,
i8,
<vscale x 2 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1157,7 +1159,7 @@ entry:
<vscale x 2 x i8> %1,
i8 %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -1166,9 +1168,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.nxv4i8.i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
i8,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1180,7 +1182,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -1190,9 +1192,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vsmul.mask.nxv4i8.i8(
<vscale x 4 x i8>,
i8,
<vscale x 4 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1205,7 +1207,7 @@ entry:
<vscale x 4 x i8> %1,
i8 %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -1214,9 +1216,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.nxv8i8.i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
i8,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1228,7 +1230,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -1238,9 +1240,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vsmul.mask.nxv8i8.i8(
<vscale x 8 x i8>,
i8,
<vscale x 8 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1253,7 +1255,7 @@ entry:
<vscale x 8 x i8> %1,
i8 %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -1262,9 +1264,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.nxv16i8.i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
i8,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1276,7 +1278,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -1286,9 +1288,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vsmul.mask.nxv16i8.i8(
<vscale x 16 x i8>,
i8,
<vscale x 16 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1301,7 +1303,7 @@ entry:
<vscale x 16 x i8> %1,
i8 %2,
<vscale x 16 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -1310,9 +1312,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.nxv32i8.i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
i8,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1324,7 +1326,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -1334,9 +1336,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vsmul.mask.nxv32i8.i8(
<vscale x 32 x i8>,
i8,
<vscale x 32 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1349,7 +1351,7 @@ entry:
<vscale x 32 x i8> %1,
i8 %2,
<vscale x 32 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -1358,9 +1360,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.nxv64i8.i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
i8,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1372,7 +1374,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -1382,9 +1384,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vsmul.mask.nxv64i8.i8(
<vscale x 64 x i8>,
i8,
<vscale x 64 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1397,7 +1399,7 @@ entry:
<vscale x 64 x i8> %1,
i8 %2,
<vscale x 64 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -1406,9 +1408,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.nxv1i16.i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
i16,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1420,7 +1422,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -1430,9 +1432,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vsmul.mask.nxv1i16.i16(
<vscale x 1 x i16>,
i16,
<vscale x 1 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1445,7 +1447,7 @@ entry:
<vscale x 1 x i16> %1,
i16 %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -1454,9 +1456,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.nxv2i16.i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
i16,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1468,7 +1470,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -1478,9 +1480,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vsmul.mask.nxv2i16.i16(
<vscale x 2 x i16>,
i16,
<vscale x 2 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1493,7 +1495,7 @@ entry:
<vscale x 2 x i16> %1,
i16 %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -1502,9 +1504,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.nxv4i16.i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
i16,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1516,7 +1518,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -1526,9 +1528,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vsmul.mask.nxv4i16.i16(
<vscale x 4 x i16>,
i16,
<vscale x 4 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1541,7 +1543,7 @@ entry:
<vscale x 4 x i16> %1,
i16 %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -1550,9 +1552,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.nxv8i16.i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
i16,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1564,7 +1566,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -1574,9 +1576,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vsmul.mask.nxv8i16.i16(
<vscale x 8 x i16>,
i16,
<vscale x 8 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1589,7 +1591,7 @@ entry:
<vscale x 8 x i16> %1,
i16 %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -1598,9 +1600,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.nxv16i16.i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
i16,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1612,7 +1614,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -1622,9 +1624,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vsmul.mask.nxv16i16.i16(
<vscale x 16 x i16>,
i16,
<vscale x 16 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1637,7 +1639,7 @@ entry:
<vscale x 16 x i16> %1,
i16 %2,
<vscale x 16 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -1646,9 +1648,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.nxv32i16.i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
i16,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1660,7 +1662,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -1670,9 +1672,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vsmul.mask.nxv32i16.i16(
<vscale x 32 x i16>,
i16,
<vscale x 32 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1685,7 +1687,7 @@ entry:
<vscale x 32 x i16> %1,
i16 %2,
<vscale x 32 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -1694,9 +1696,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.nxv1i32.i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
i32,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1708,7 +1710,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -1718,9 +1720,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vsmul.mask.nxv1i32.i32(
<vscale x 1 x i32>,
i32,
<vscale x 1 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1733,7 +1735,7 @@ entry:
<vscale x 1 x i32> %1,
i32 %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -1742,9 +1744,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.nxv2i32.i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
i32,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1756,7 +1758,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -1766,9 +1768,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vsmul.mask.nxv2i32.i32(
<vscale x 2 x i32>,
i32,
<vscale x 2 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1781,7 +1783,7 @@ entry:
<vscale x 2 x i32> %1,
i32 %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -1790,9 +1792,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.nxv4i32.i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
i32,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1804,7 +1806,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -1814,9 +1816,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vsmul.mask.nxv4i32.i32(
<vscale x 4 x i32>,
i32,
<vscale x 4 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1829,7 +1831,7 @@ entry:
<vscale x 4 x i32> %1,
i32 %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -1838,9 +1840,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.nxv8i32.i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
i32,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1852,7 +1854,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -1862,9 +1864,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vsmul.mask.nxv8i32.i32(
<vscale x 8 x i32>,
i32,
<vscale x 8 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1877,7 +1879,7 @@ entry:
<vscale x 8 x i32> %1,
i32 %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -1886,9 +1888,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.nxv16i32.i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
i32,
- i64, i64);
+ iXLen, iXLen)
-define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vsmul_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1900,7 +1902,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -1910,9 +1912,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vsmul.mask.nxv16i32.i32(
<vscale x 16 x i32>,
i32,
<vscale x 16 x i1>,
- i64, i64, i64);
+ iXLen, iXLen, iXLen)
-define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: csrwi vxrm, 0
@@ -1925,7 +1927,7 @@ entry:
<vscale x 16 x i32> %1,
i32 %2,
<vscale x 16 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -1933,21 +1935,35 @@ entry:
declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64, i64, i64);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
+ i64,
+ iXLen, iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsmul.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)

ret <vscale x 1 x i64> %a
}
@@ -1957,22 +1973,35 @@ declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
<vscale x 1 x i64>,
i64,
<vscale x 1 x i1>,
- i64, i64, i64);
-
-define <vscale x 1 x i64> @intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen, iXLen, iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v9, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vsmul.vx v8, v9, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.i64(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
i64 %2,
<vscale x 1 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)

ret <vscale x 1 x i64> %a
}
@@ -1980,21 +2009,35 @@ entry:
declare <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64, i64, i64);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
+ i64,
+ iXLen, iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsmul.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vsmul.nxv2i64.i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)

ret <vscale x 2 x i64> %a
}
@@ -2004,22 +2047,35 @@ declare <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
<vscale x 2 x i64>,
i64,
<vscale x 2 x i1>,
- i64, i64, i64);
-
-define <vscale x 2 x i64> @intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen, iXLen, iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v10, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT: vsmul.vx v8, v10, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vsmul.mask.nxv2i64.i64(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
i64 %2,
<vscale x 2 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)

ret <vscale x 2 x i64> %a
}
@@ -2027,21 +2083,35 @@ entry:
declare <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64, i64, i64);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
+ i64,
+ iXLen, iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsmul.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vsmul.nxv4i64.i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)

ret <vscale x 4 x i64> %a
}
@@ -2051,22 +2121,35 @@ declare <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
<vscale x 4 x i64>,
i64,
<vscale x 4 x i1>,
- i64, i64, i64);
-
-define <vscale x 4 x i64> @intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen, iXLen, iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v12, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT: vsmul.vx v8, v12, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vsmul.mask.nxv4i64.i64(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
i64 %2,
<vscale x 4 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)

ret <vscale x 4 x i64> %a
}
@@ -2074,21 +2157,35 @@ entry:
declare <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64, i64, i64);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vsmul.vx v8, v8, a0
-; CHECK-NEXT: ret
+ i64,
+ iXLen, iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsmul.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vsmul.nxv8i64.i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 %1,
- i64 0, i64 %2)
+ iXLen 0, iXLen %2)

ret <vscale x 8 x i64> %a
}
@@ -2098,22 +2195,35 @@ declare <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
<vscale x 8 x i64>,
i64,
<vscale x 8 x i1>,
- i64, i64, i64);
-
-define <vscale x 8 x i64> @intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: csrwi vxrm, 0
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vsmul.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen, iXLen, iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT: vlse64.v v24, (a0), zero
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vsmul.vv v8, v16, v24, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vsmul_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT: vsmul.vx v8, v16, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vsmul.mask.nxv8i64.i64(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
i64 %2,
<vscale x 8 x i1> %3,
- i64 0, i64 %4, i64 1)
+ iXLen 0, iXLen %4, iXLen 1)

ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-rv64.ll
deleted file mode 100644
index 3928e6f..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-rv64.ll
+++ /dev/null
@@ -1,2075 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.nxv1i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i64 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- <vscale x 1 x i8> %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.nxv2i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i64 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- <vscale x 2 x i8> %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.nxv4i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i64 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- <vscale x 4 x i8> %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.nxv8i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i64 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.nxv16i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i64 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- <vscale x 16 x i8> %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.nxv32i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i64 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i1>,
- i64,
- i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- <vscale x 32 x i8> %2,
- <vscale x 32 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.nxv64i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i64 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i1>,
- i64,
- i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- <vscale x 64 x i8> %2,
- <vscale x 64 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.nxv1i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i64 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.nxv2i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i64 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.nxv4i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i64 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.nxv8i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i64 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.nxv16i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i64 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.nxv32i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i64 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i64,
- i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.nxv1i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i64 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- <vscale x 1 x i32> %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.nxv2i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i64 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- <vscale x 2 x i32> %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.nxv4i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i64 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- <vscale x 4 x i32> %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.nxv8i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i64 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- <vscale x 8 x i32> %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.nxv16i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i64 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- <vscale x 16 x i32> %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.nxv1i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- <vscale x 1 x i64> %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.nxv2i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- <vscale x 2 x i64> %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.nxv4i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- <vscale x 4 x i64> %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vssub.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.nxv8i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- <vscale x 8 x i64> %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- <vscale x 32 x i1>,
- i64,
- i64);
-
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 %2,
- <vscale x 32 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 %1,
- i64 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- <vscale x 64 x i1>,
- i64,
- i64);
-
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 %2,
- <vscale x 64 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 %1,
- i64 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- <vscale x 32 x i1>,
- i64,
- i64);
-
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 %2,
- <vscale x 32 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i64 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i64,
- i64);
-
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- <vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vssub.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2,
- <vscale x 1 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- <vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vssub.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2,
- <vscale x 2 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- <vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vssub.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2,
- <vscale x 4 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vssub.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1,
- i64 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- <vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vssub.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2,
- <vscale x 8 x i1> %3,
- i64 %4, i64 1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vssub.ll
index f3ba5da..50fca5e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i32);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
<vscale x 1 x i8> %1,
<vscale x 1 x i8> %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i32);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
<vscale x 2 x i8> %1,
<vscale x 2 x i8> %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i32);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
<vscale x 4 x i8> %1,
<vscale x 4 x i8> %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i32);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
<vscale x 8 x i8> %1,
<vscale x 8 x i8> %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i32);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
<vscale x 16 x i8> %1,
<vscale x 16 x i8> %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i32);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
<vscale x 32 x i8> %1,
<vscale x 32 x i8> %2,
<vscale x 32 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i32);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
<vscale x 64 x i8> %1,
<vscale x 64 x i8> %2,
<vscale x 64 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i32);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
<vscale x 1 x i16> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i32);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
<vscale x 2 x i16> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i32);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
<vscale x 4 x i16> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i32);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
<vscale x 8 x i16> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i32);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
<vscale x 16 x i16> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i32);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
<vscale x 32 x i16> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i32);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
<vscale x 1 x i32> %1,
<vscale x 1 x i32> %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i32);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
<vscale x 2 x i32> %1,
<vscale x 2 x i32> %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i32);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
<vscale x 4 x i32> %1,
<vscale x 4 x i32> %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i32);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
<vscale x 8 x i32> %1,
<vscale x 8 x i32> %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i32);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
<vscale x 16 x i32> %1,
<vscale x 16 x i32> %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i32);
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
<vscale x 1 x i64> %1,
<vscale x 1 x i64> %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i32);
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
<vscale x 2 x i64> %1,
<vscale x 2 x i64> %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i32);
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
<vscale x 4 x i64> %1,
<vscale x 4 x i64> %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i32);
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
<vscale x 8 x i64> %1,
<vscale x 8 x i64> %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vssub.nxv1i8.i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssub_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssub.mask.nxv1i8.i8(
<vscale x 1 x i8>,
i8,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
<vscale x 1 x i8> %1,
i8 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.nxv2i8.i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssub_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssub.mask.nxv2i8.i8(
<vscale x 2 x i8>,
i8,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
<vscale x 2 x i8> %1,
i8 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.nxv4i8.i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssub_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssub.mask.nxv4i8.i8(
<vscale x 4 x i8>,
i8,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
<vscale x 4 x i8> %1,
i8 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.nxv8i8.i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssub_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssub.mask.nxv8i8.i8(
<vscale x 8 x i8>,
i8,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
<vscale x 8 x i8> %1,
i8 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.nxv16i8.i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssub_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssub.mask.nxv16i8.i8(
<vscale x 16 x i8>,
i8,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
<vscale x 16 x i8> %1,
i8 %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.nxv32i8.i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssub_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssub.mask.nxv32i8.i8(
<vscale x 32 x i8>,
i8,
<vscale x 32 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
<vscale x 32 x i8> %1,
i8 %2,
<vscale x 32 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.nxv64i8.i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
i8,
- i32);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssub_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssub.mask.nxv64i8.i8(
<vscale x 64 x i8>,
i8,
<vscale x 64 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
<vscale x 64 x i8> %1,
i8 %2,
<vscale x 64 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.nxv1i16.i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssub_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssub.mask.nxv1i16.i16(
<vscale x 1 x i16>,
i16,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
<vscale x 1 x i16> %1,
i16 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.nxv2i16.i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssub_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssub.mask.nxv2i16.i16(
<vscale x 2 x i16>,
i16,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
<vscale x 2 x i16> %1,
i16 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.nxv4i16.i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssub_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssub.mask.nxv4i16.i16(
<vscale x 4 x i16>,
i16,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
<vscale x 4 x i16> %1,
i16 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.nxv8i16.i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssub_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssub.mask.nxv8i16.i16(
<vscale x 8 x i16>,
i16,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
<vscale x 8 x i16> %1,
i16 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.nxv16i16.i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssub_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssub.mask.nxv16i16.i16(
<vscale x 16 x i16>,
i16,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
<vscale x 16 x i16> %1,
i16 %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.nxv32i16.i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
i16,
- i32);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssub_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssub.mask.nxv32i16.i16(
<vscale x 32 x i16>,
i16,
<vscale x 32 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
<vscale x 32 x i16> %1,
i16 %2,
<vscale x 32 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.nxv1i32.i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssub_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssub.mask.nxv1i32.i32(
<vscale x 1 x i32>,
i32,
<vscale x 1 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
<vscale x 1 x i32> %1,
i32 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.nxv2i32.i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssub_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssub.mask.nxv2i32.i32(
<vscale x 2 x i32>,
i32,
<vscale x 2 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
<vscale x 2 x i32> %1,
i32 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.nxv4i32.i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssub_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssub.mask.nxv4i32.i32(
<vscale x 4 x i32>,
i32,
<vscale x 4 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
<vscale x 4 x i32> %1,
i32 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.nxv8i32.i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssub_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssub.mask.nxv8i32.i32(
<vscale x 8 x i32>,
i32,
<vscale x 8 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
<vscale x 8 x i32> %1,
i32 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.nxv16i32.i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
i32,
- i32);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssub_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssub_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssub.mask.nxv16i32.i32(
<vscale x 16 x i32>,
i32,
<vscale x 16 x i1>,
- i32,
- i32);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
<vscale x 16 x i32> %1,
i32 %2,
<vscale x 16 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -1890,26 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
i64,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vssub_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v9, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v8, v9
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssub_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vssub.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vssub.nxv1i64.i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -1919,28 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
<vscale x 1 x i64>,
i64,
<vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v9, v10, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vssub.vv v8, v9, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vssub.vx v8, v9, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vssub.mask.nxv1i64.i64(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
i64 %2,
<vscale x 1 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -1949,26 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
i64,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vssub_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v8, v10
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssub_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vssub.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vssub.nxv2i64.i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -1978,28 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
<vscale x 2 x i64>,
i64,
<vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v10, v12, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vssub.vv v8, v10, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT: vssub.vx v8, v10, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vssub.mask.nxv2i64.i64(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
i64 %2,
<vscale x 2 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -2008,26 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
i64,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vssub_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v8, v12
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssub_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vssub.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vssub.nxv4i64.i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -2037,28 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
<vscale x 4 x i64>,
i64,
<vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v12, v16, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vssub.vv v8, v12, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT: vssub.vx v8, v12, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vssub.mask.nxv4i64.i64(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
i64 %2,
<vscale x 4 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -2067,26 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
i64,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vssub_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v8, v16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssub_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vssub.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vssub.nxv8i64.i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 %1,
- i32 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -2096,28 +2140,34 @@ declare <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
<vscale x 8 x i64>,
i64,
<vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT: vlse64.v v24, (a0), zero
-; CHECK-NEXT: vssub.vv v8, v16, v24, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT: vlse64.v v24, (a0), zero
+; RV32-NEXT: vssub.vv v8, v16, v24, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssub_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT: vssub.vx v8, v16, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vssub.mask.nxv8i64.i64(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
i64 %2,
<vscale x 8 x i1> %3,
- i32 %4, i32 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-rv32.ll
deleted file mode 100644
index 5e71471..0000000
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv32.ll
+++ /dev/null
@@ -1,2123 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.nxv1i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.nxv1i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- <vscale x 1 x i8> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.nxv2i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.nxv2i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- <vscale x 2 x i8> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.nxv4i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.nxv4i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- <vscale x 4 x i8> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.nxv8i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.nxv8i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- <vscale x 8 x i8> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.nxv16i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.nxv16i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- <vscale x 16 x i8> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.nxv32i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.nxv32i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- <vscale x 32 x i8> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.nxv64i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8r.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.nxv64i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- <vscale x 64 x i8> %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.nxv1i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.nxv1i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- <vscale x 1 x i16> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.nxv2i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.nxv2i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- <vscale x 2 x i16> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.nxv4i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.nxv4i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- <vscale x 4 x i16> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.nxv8i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.nxv8i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- <vscale x 8 x i16> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.nxv16i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- <vscale x 16 x i16> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.nxv32i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re16.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.nxv32i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- <vscale x 32 x i16> %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.nxv1i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.nxv1i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- <vscale x 1 x i32> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.nxv2i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.nxv2i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- <vscale x 2 x i32> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.nxv4i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.nxv4i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- <vscale x 4 x i32> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.nxv8i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.nxv8i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- <vscale x 8 x i32> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.nxv16i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re32.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.nxv16i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- <vscale x 16 x i32> %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.nxv1i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.nxv1i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- <vscale x 1 x i64> %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v10
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.nxv2i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.nxv2i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- <vscale x 2 x i64> %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v12
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.nxv4i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.nxv4i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- <vscale x 4 x i64> %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vssubu.vv v8, v8, v16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.nxv8i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl8re64.v v24, (a0)
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.nxv8i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- <vscale x 8 x i64> %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.i8(
- <vscale x 1 x i8> undef,
- <vscale x 1 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.i8(
- <vscale x 1 x i8>,
- <vscale x 1 x i8>,
- i8,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.i8(
- <vscale x 1 x i8> %0,
- <vscale x 1 x i8> %1,
- i8 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.i8(
- <vscale x 2 x i8> undef,
- <vscale x 2 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.i8(
- <vscale x 2 x i8>,
- <vscale x 2 x i8>,
- i8,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.i8(
- <vscale x 2 x i8> %0,
- <vscale x 2 x i8> %1,
- i8 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.i8(
- <vscale x 4 x i8> undef,
- <vscale x 4 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.i8(
- <vscale x 4 x i8>,
- <vscale x 4 x i8>,
- i8,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.i8(
- <vscale x 4 x i8> %0,
- <vscale x 4 x i8> %1,
- i8 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.i8(
- <vscale x 8 x i8> undef,
- <vscale x 8 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.i8(
- <vscale x 8 x i8>,
- <vscale x 8 x i8>,
- i8,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.i8(
- <vscale x 8 x i8> %0,
- <vscale x 8 x i8> %1,
- i8 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.i8(
- <vscale x 16 x i8> undef,
- <vscale x 16 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.i8(
- <vscale x 16 x i8>,
- <vscale x 16 x i8>,
- i8,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.i8(
- <vscale x 16 x i8> %0,
- <vscale x 16 x i8> %1,
- i8 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.i8(
- <vscale x 32 x i8> undef,
- <vscale x 32 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.i8(
- <vscale x 32 x i8>,
- <vscale x 32 x i8>,
- i8,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.i8(
- <vscale x 32 x i8> %0,
- <vscale x 32 x i8> %1,
- i8 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.i8(
- <vscale x 64 x i8> undef,
- <vscale x 64 x i8> %0,
- i8 %1,
- i32 %2)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.i8(
- <vscale x 64 x i8>,
- <vscale x 64 x i8>,
- i8,
- <vscale x 64 x i1>,
- i32,
- i32);
-
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.i8(
- <vscale x 64 x i8> %0,
- <vscale x 64 x i8> %1,
- i8 %2,
- <vscale x 64 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 64 x i8> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.i16(
- <vscale x 1 x i16> undef,
- <vscale x 1 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.i16(
- <vscale x 1 x i16>,
- <vscale x 1 x i16>,
- i16,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.i16(
- <vscale x 1 x i16> %0,
- <vscale x 1 x i16> %1,
- i16 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.i16(
- <vscale x 2 x i16> undef,
- <vscale x 2 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.i16(
- <vscale x 2 x i16>,
- <vscale x 2 x i16>,
- i16,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.i16(
- <vscale x 2 x i16> %0,
- <vscale x 2 x i16> %1,
- i16 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.i16(
- <vscale x 4 x i16> undef,
- <vscale x 4 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.i16(
- <vscale x 4 x i16>,
- <vscale x 4 x i16>,
- i16,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.i16(
- <vscale x 4 x i16> %0,
- <vscale x 4 x i16> %1,
- i16 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.i16(
- <vscale x 8 x i16> undef,
- <vscale x 8 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.i16(
- <vscale x 8 x i16>,
- <vscale x 8 x i16>,
- i16,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.i16(
- <vscale x 8 x i16> %0,
- <vscale x 8 x i16> %1,
- i16 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.i16(
- <vscale x 16 x i16> undef,
- <vscale x 16 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.i16(
- <vscale x 16 x i16>,
- <vscale x 16 x i16>,
- i16,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.i16(
- <vscale x 16 x i16> %0,
- <vscale x 16 x i16> %1,
- i16 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.i16(
- <vscale x 32 x i16> undef,
- <vscale x 32 x i16> %0,
- i16 %1,
- i32 %2)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.i16(
- <vscale x 32 x i16>,
- <vscale x 32 x i16>,
- i16,
- <vscale x 32 x i1>,
- i32,
- i32);
-
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.i16(
- <vscale x 32 x i16> %0,
- <vscale x 32 x i16> %1,
- i16 %2,
- <vscale x 32 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 32 x i16> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.i32(
- <vscale x 1 x i32> undef,
- <vscale x 1 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.i32(
- <vscale x 1 x i32>,
- <vscale x 1 x i32>,
- i32,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.i32(
- <vscale x 1 x i32> %0,
- <vscale x 1 x i32> %1,
- i32 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.i32(
- <vscale x 2 x i32> undef,
- <vscale x 2 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.i32(
- <vscale x 2 x i32>,
- <vscale x 2 x i32>,
- i32,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.i32(
- <vscale x 2 x i32> %0,
- <vscale x 2 x i32> %1,
- i32 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.i32(
- <vscale x 4 x i32> undef,
- <vscale x 4 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.i32(
- <vscale x 4 x i32>,
- <vscale x 4 x i32>,
- i32,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.i32(
- <vscale x 4 x i32> %0,
- <vscale x 4 x i32> %1,
- i32 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.i32(
- <vscale x 8 x i32> undef,
- <vscale x 8 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.i32(
- <vscale x 8 x i32>,
- <vscale x 8 x i32>,
- i32,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.i32(
- <vscale x 8 x i32> %0,
- <vscale x 8 x i32> %1,
- i32 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.i32(
- <vscale x 16 x i32> undef,
- <vscale x 16 x i32> %0,
- i32 %1,
- i32 %2)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.i32(
- <vscale x 16 x i32>,
- <vscale x 16 x i32>,
- i32,
- <vscale x 16 x i1>,
- i32,
- i32);
-
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.i32(
- <vscale x 16 x i32> %0,
- <vscale x 16 x i32> %1,
- i32 %2,
- <vscale x 16 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 16 x i32> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v9, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v8, v9
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
- <vscale x 1 x i64> undef,
- <vscale x 1 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
- <vscale x 1 x i64>,
- <vscale x 1 x i64>,
- i64,
- <vscale x 1 x i1>,
- i32,
- i32);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, mu
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v9, v10, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
- <vscale x 1 x i64> %0,
- <vscale x 1 x i64> %1,
- i64 %2,
- <vscale x 1 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 1 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v8, v10
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
- <vscale x 2 x i64> undef,
- <vscale x 2 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
- <vscale x 2 x i64>,
- <vscale x 2 x i64>,
- i64,
- <vscale x 2 x i1>,
- i32,
- i32);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m2, ta, mu
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v10, v12, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
- <vscale x 2 x i64> %0,
- <vscale x 2 x i64> %1,
- i64 %2,
- <vscale x 2 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 2 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, ma
-; CHECK-NEXT: vlse64.v v12, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v8, v12
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
- <vscale x 4 x i64> undef,
- <vscale x 4 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
- <vscale x 4 x i64>,
- <vscale x 4 x i64>,
- i64,
- <vscale x 4 x i1>,
- i32,
- i32);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m4, ta, mu
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v12, v16, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
- <vscale x 4 x i64> %0,
- <vscale x 4 x i64> %1,
- i64 %2,
- <vscale x 4 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 4 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i32 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v8, v16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
- <vscale x 8 x i64> undef,
- <vscale x 8 x i64> %0,
- i64 %1,
- i32 %2)
-
- ret <vscale x 8 x i64> %a
-}
-
-declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
- <vscale x 8 x i64>,
- <vscale x 8 x i64>,
- i64,
- <vscale x 8 x i1>,
- i32,
- i32);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i32 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: sw a0, 8(sp)
-; CHECK-NEXT: addi a0, sp, 8
-; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, mu
-; CHECK-NEXT: vlse64.v v24, (a0), zero
-; CHECK-NEXT: vssubu.vv v8, v16, v24, v0.t
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-entry:
- %a = call <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
- <vscale x 8 x i64> %0,
- <vscale x 8 x i64> %1,
- i64 %2,
- <vscale x 8 x i1> %3,
- i32 %4, i32 1)
-
- ret <vscale x 8 x i64> %a
-}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu.ll
index 71b623c..db1b4ce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu.ll
@@ -1,14 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \
-; RUN: < %s | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i8>,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
@@ -19,7 +21,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -29,10 +31,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.nxv1i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i8_nxv1i8_nxv1i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
@@ -44,7 +46,7 @@ entry:
<vscale x 1 x i8> %1,
<vscale x 1 x i8> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -53,9 +55,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i8>,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
@@ -66,7 +68,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
<vscale x 2 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -76,10 +78,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.nxv2i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i8_nxv2i8_nxv2i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu
@@ -91,7 +93,7 @@ entry:
<vscale x 2 x i8> %1,
<vscale x 2 x i8> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -100,9 +102,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i8>,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
@@ -113,7 +115,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
<vscale x 4 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -123,10 +125,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.nxv4i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i8_nxv4i8_nxv4i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu
@@ -138,7 +140,7 @@ entry:
<vscale x 4 x i8> %1,
<vscale x 4 x i8> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -147,9 +149,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i8>,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
@@ -160,7 +162,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
<vscale x 8 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -170,10 +172,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.nxv8i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i8_nxv8i8_nxv8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu
@@ -185,7 +187,7 @@ entry:
<vscale x 8 x i8> %1,
<vscale x 8 x i8> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -194,9 +196,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i8>,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
@@ -207,7 +209,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
<vscale x 16 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -217,10 +219,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.nxv16i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i8_nxv16i8_nxv16i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu
@@ -232,7 +234,7 @@ entry:
<vscale x 16 x i8> %1,
<vscale x 16 x i8> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -241,9 +243,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i8>,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
@@ -254,7 +256,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
<vscale x 32 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -264,10 +266,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.nxv32i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i8_nxv32i8_nxv32i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu
@@ -279,7 +281,7 @@ entry:
<vscale x 32 x i8> %1,
<vscale x 32 x i8> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -288,9 +290,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i8>,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
@@ -301,7 +303,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
<vscale x 64 x i8> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -311,10 +313,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.nxv64i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv64i8_nxv64i8_nxv64i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8r.v v24, (a0)
@@ -327,7 +329,7 @@ entry:
<vscale x 64 x i8> %1,
<vscale x 64 x i8> %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -336,9 +338,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i16>,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
@@ -349,7 +351,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
<vscale x 1 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -359,10 +361,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.nxv1i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i16_nxv1i16_nxv1i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu
@@ -374,7 +376,7 @@ entry:
<vscale x 1 x i16> %1,
<vscale x 1 x i16> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -383,9 +385,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i16>,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
@@ -396,7 +398,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
<vscale x 2 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -406,10 +408,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.nxv2i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i16_nxv2i16_nxv2i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu
@@ -421,7 +423,7 @@ entry:
<vscale x 2 x i16> %1,
<vscale x 2 x i16> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -430,9 +432,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i16>,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
@@ -443,7 +445,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
<vscale x 4 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -453,10 +455,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.nxv4i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i16_nxv4i16_nxv4i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu
@@ -468,7 +470,7 @@ entry:
<vscale x 4 x i16> %1,
<vscale x 4 x i16> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -477,9 +479,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i16>,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
@@ -490,7 +492,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
<vscale x 8 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -500,10 +502,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.nxv8i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i16_nxv8i16_nxv8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu
@@ -515,7 +517,7 @@ entry:
<vscale x 8 x i16> %1,
<vscale x 8 x i16> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -524,9 +526,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i16>,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
@@ -537,7 +539,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
<vscale x 16 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -547,10 +549,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.nxv16i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i16_nxv16i16_nxv16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu
@@ -562,7 +564,7 @@ entry:
<vscale x 16 x i16> %1,
<vscale x 16 x i16> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -571,9 +573,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i16>,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
@@ -584,7 +586,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
<vscale x 32 x i16> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -594,10 +596,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.nxv32i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv32i16_nxv32i16_nxv32i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re16.v v24, (a0)
@@ -610,7 +612,7 @@ entry:
<vscale x 32 x i16> %1,
<vscale x 32 x i16> %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -619,9 +621,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i32>,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
@@ -632,7 +634,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
<vscale x 1 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -642,10 +644,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.nxv1i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i32_nxv1i32_nxv1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu
@@ -657,7 +659,7 @@ entry:
<vscale x 1 x i32> %1,
<vscale x 1 x i32> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -666,9 +668,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i32>,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
@@ -679,7 +681,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
<vscale x 2 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -689,10 +691,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.nxv2i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i32_nxv2i32_nxv2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
@@ -704,7 +706,7 @@ entry:
<vscale x 2 x i32> %1,
<vscale x 2 x i32> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -713,9 +715,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i32>,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
@@ -726,7 +728,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
<vscale x 4 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -736,10 +738,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.nxv4i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i32_nxv4i32_nxv4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu
@@ -751,7 +753,7 @@ entry:
<vscale x 4 x i32> %1,
<vscale x 4 x i32> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -760,9 +762,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i32>,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
@@ -773,7 +775,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
<vscale x 8 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -783,10 +785,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.nxv8i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i32_nxv8i32_nxv8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu
@@ -798,7 +800,7 @@ entry:
<vscale x 8 x i32> %1,
<vscale x 8 x i32> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -807,9 +809,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i32>,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
@@ -820,7 +822,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
<vscale x 16 x i32> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -830,10 +832,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.nxv16i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv16i32_nxv16i32_nxv16i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re32.v v24, (a0)
@@ -846,7 +848,7 @@ entry:
<vscale x 16 x i32> %1,
<vscale x 16 x i32> %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -855,9 +857,9 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i64>,
- i64);
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
@@ -868,7 +870,7 @@ entry:
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -878,10 +880,10 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.nxv1i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i64> @intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv1i64_nxv1i64_nxv1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
@@ -893,7 +895,7 @@ entry:
<vscale x 1 x i64> %1,
<vscale x 1 x i64> %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -902,9 +904,9 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i64>,
- i64);
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
@@ -915,7 +917,7 @@ entry:
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -925,10 +927,10 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.nxv2i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i64> @intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv2i64_nxv2i64_nxv2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu
@@ -940,7 +942,7 @@ entry:
<vscale x 2 x i64> %1,
<vscale x 2 x i64> %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -949,9 +951,9 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i64>,
- i64);
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
@@ -962,7 +964,7 @@ entry:
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -972,10 +974,10 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.nxv4i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i64> @intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv4i64_nxv4i64_nxv4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu
@@ -987,7 +989,7 @@ entry:
<vscale x 4 x i64> %1,
<vscale x 4 x i64> %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -996,9 +998,9 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i64>,
- i64);
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
@@ -1009,7 +1011,7 @@ entry:
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -1019,10 +1021,10 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.nxv8i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i64> @intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vv_nxv8i64_nxv8i64_nxv8i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl8re64.v v24, (a0)
@@ -1035,7 +1037,7 @@ entry:
<vscale x 8 x i64> %1,
<vscale x 8 x i64> %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
@@ -1044,9 +1046,9 @@ declare <vscale x 1 x i8> @llvm.riscv.vssubu.nxv1i8.i8(
<vscale x 1 x i8>,
<vscale x 1 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
@@ -1057,7 +1059,7 @@ entry:
<vscale x 1 x i8> undef,
<vscale x 1 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i8> %a
}
@@ -1067,10 +1069,10 @@ declare <vscale x 1 x i8> @llvm.riscv.vssubu.mask.nxv1i8.i8(
<vscale x 1 x i8>,
i8,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i8> @intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i8> @intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i8 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i8_nxv1i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu
@@ -1082,7 +1084,7 @@ entry:
<vscale x 1 x i8> %1,
i8 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i8> %a
}
@@ -1091,9 +1093,9 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.nxv2i8.i8(
<vscale x 2 x i8>,
<vscale x 2 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
@@ -1104,7 +1106,7 @@ entry:
<vscale x 2 x i8> undef,
<vscale x 2 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i8> %a
}
@@ -1114,10 +1116,10 @@ declare <vscale x 2 x i8> @llvm.riscv.vssubu.mask.nxv2i8.i8(
<vscale x 2 x i8>,
i8,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i8> @intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i8> @intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8(<vscale x 2 x i8> %0, <vscale x 2 x i8> %1, i8 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i8_nxv2i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
@@ -1129,7 +1131,7 @@ entry:
<vscale x 2 x i8> %1,
i8 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i8> %a
}
@@ -1138,9 +1140,9 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.nxv4i8.i8(
<vscale x 4 x i8>,
<vscale x 4 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
@@ -1151,7 +1153,7 @@ entry:
<vscale x 4 x i8> undef,
<vscale x 4 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i8> %a
}
@@ -1161,10 +1163,10 @@ declare <vscale x 4 x i8> @llvm.riscv.vssubu.mask.nxv4i8.i8(
<vscale x 4 x i8>,
i8,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i8> @intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i8> @intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1, i8 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i8_nxv4i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu
@@ -1176,7 +1178,7 @@ entry:
<vscale x 4 x i8> %1,
i8 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i8> %a
}
@@ -1185,9 +1187,9 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.nxv8i8.i8(
<vscale x 8 x i8>,
<vscale x 8 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
@@ -1198,7 +1200,7 @@ entry:
<vscale x 8 x i8> undef,
<vscale x 8 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i8> %a
}
@@ -1208,10 +1210,10 @@ declare <vscale x 8 x i8> @llvm.riscv.vssubu.mask.nxv8i8.i8(
<vscale x 8 x i8>,
i8,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i8> @intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i8> @intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1, i8 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i8_nxv8i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu
@@ -1223,7 +1225,7 @@ entry:
<vscale x 8 x i8> %1,
i8 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i8> %a
}
@@ -1232,9 +1234,9 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.nxv16i8.i8(
<vscale x 16 x i8>,
<vscale x 16 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
@@ -1245,7 +1247,7 @@ entry:
<vscale x 16 x i8> undef,
<vscale x 16 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i8> %a
}
@@ -1255,10 +1257,10 @@ declare <vscale x 16 x i8> @llvm.riscv.vssubu.mask.nxv16i8.i8(
<vscale x 16 x i8>,
i8,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i8> @intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i8> @intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1, i8 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i8_nxv16i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu
@@ -1270,7 +1272,7 @@ entry:
<vscale x 16 x i8> %1,
i8 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i8> %a
}
@@ -1279,9 +1281,9 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.nxv32i8.i8(
<vscale x 32 x i8>,
<vscale x 32 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
@@ -1292,7 +1294,7 @@ entry:
<vscale x 32 x i8> undef,
<vscale x 32 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i8> %a
}
@@ -1302,10 +1304,10 @@ declare <vscale x 32 x i8> @llvm.riscv.vssubu.mask.nxv32i8.i8(
<vscale x 32 x i8>,
i8,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i8> @intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i8> @intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8(<vscale x 32 x i8> %0, <vscale x 32 x i8> %1, i8 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i8_nxv32i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu
@@ -1317,7 +1319,7 @@ entry:
<vscale x 32 x i8> %1,
i8 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i8> %a
}
@@ -1326,9 +1328,9 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.nxv64i8.i8(
<vscale x 64 x i8>,
<vscale x 64 x i8>,
i8,
- i64);
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, i64 %2) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, i8 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
@@ -1339,7 +1341,7 @@ entry:
<vscale x 64 x i8> undef,
<vscale x 64 x i8> %0,
i8 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 64 x i8> %a
}
@@ -1349,10 +1351,10 @@ declare <vscale x 64 x i8> @llvm.riscv.vssubu.mask.nxv64i8.i8(
<vscale x 64 x i8>,
i8,
<vscale x 64 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 64 x i8> @intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, i64 %4) nounwind {
+define <vscale x 64 x i8> @intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8(<vscale x 64 x i8> %0, <vscale x 64 x i8> %1, i8 %2, <vscale x 64 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv64i8_nxv64i8_i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu
@@ -1364,7 +1366,7 @@ entry:
<vscale x 64 x i8> %1,
i8 %2,
<vscale x 64 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 64 x i8> %a
}
@@ -1373,9 +1375,9 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.nxv1i16.i16(
<vscale x 1 x i16>,
<vscale x 1 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
@@ -1386,7 +1388,7 @@ entry:
<vscale x 1 x i16> undef,
<vscale x 1 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i16> %a
}
@@ -1396,10 +1398,10 @@ declare <vscale x 1 x i16> @llvm.riscv.vssubu.mask.nxv1i16.i16(
<vscale x 1 x i16>,
i16,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i16> @intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i16> @intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16(<vscale x 1 x i16> %0, <vscale x 1 x i16> %1, i16 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i16_nxv1i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu
@@ -1411,7 +1413,7 @@ entry:
<vscale x 1 x i16> %1,
i16 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i16> %a
}
@@ -1420,9 +1422,9 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.nxv2i16.i16(
<vscale x 2 x i16>,
<vscale x 2 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
@@ -1433,7 +1435,7 @@ entry:
<vscale x 2 x i16> undef,
<vscale x 2 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i16> %a
}
@@ -1443,10 +1445,10 @@ declare <vscale x 2 x i16> @llvm.riscv.vssubu.mask.nxv2i16.i16(
<vscale x 2 x i16>,
i16,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i16> @intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i16> @intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1, i16 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i16_nxv2i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu
@@ -1458,7 +1460,7 @@ entry:
<vscale x 2 x i16> %1,
i16 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i16> %a
}
@@ -1467,9 +1469,9 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.nxv4i16.i16(
<vscale x 4 x i16>,
<vscale x 4 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
@@ -1480,7 +1482,7 @@ entry:
<vscale x 4 x i16> undef,
<vscale x 4 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i16> %a
}
@@ -1490,10 +1492,10 @@ declare <vscale x 4 x i16> @llvm.riscv.vssubu.mask.nxv4i16.i16(
<vscale x 4 x i16>,
i16,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i16> @intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i16> @intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1, i16 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i16_nxv4i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu
@@ -1505,7 +1507,7 @@ entry:
<vscale x 4 x i16> %1,
i16 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i16> %a
}
@@ -1514,9 +1516,9 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.nxv8i16.i16(
<vscale x 8 x i16>,
<vscale x 8 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
@@ -1527,7 +1529,7 @@ entry:
<vscale x 8 x i16> undef,
<vscale x 8 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i16> %a
}
@@ -1537,10 +1539,10 @@ declare <vscale x 8 x i16> @llvm.riscv.vssubu.mask.nxv8i16.i16(
<vscale x 8 x i16>,
i16,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i16> @intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i16> @intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1, i16 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i16_nxv8i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu
@@ -1552,7 +1554,7 @@ entry:
<vscale x 8 x i16> %1,
i16 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i16> %a
}
@@ -1561,9 +1563,9 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.nxv16i16.i16(
<vscale x 16 x i16>,
<vscale x 16 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
@@ -1574,7 +1576,7 @@ entry:
<vscale x 16 x i16> undef,
<vscale x 16 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i16> %a
}
@@ -1584,10 +1586,10 @@ declare <vscale x 16 x i16> @llvm.riscv.vssubu.mask.nxv16i16.i16(
<vscale x 16 x i16>,
i16,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i16> @intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i16> @intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1, i16 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i16_nxv16i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu
@@ -1599,7 +1601,7 @@ entry:
<vscale x 16 x i16> %1,
i16 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i16> %a
}
@@ -1608,9 +1610,9 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.nxv32i16.i16(
<vscale x 32 x i16>,
<vscale x 32 x i16>,
i16,
- i64);
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, i64 %2) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, i16 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
@@ -1621,7 +1623,7 @@ entry:
<vscale x 32 x i16> undef,
<vscale x 32 x i16> %0,
i16 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 32 x i16> %a
}
@@ -1631,10 +1633,10 @@ declare <vscale x 32 x i16> @llvm.riscv.vssubu.mask.nxv32i16.i16(
<vscale x 32 x i16>,
i16,
<vscale x 32 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 32 x i16> @intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, i64 %4) nounwind {
+define <vscale x 32 x i16> @intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16(<vscale x 32 x i16> %0, <vscale x 32 x i16> %1, i16 %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv32i16_nxv32i16_i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu
@@ -1646,7 +1648,7 @@ entry:
<vscale x 32 x i16> %1,
i16 %2,
<vscale x 32 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 32 x i16> %a
}
@@ -1655,9 +1657,9 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.nxv1i32.i32(
<vscale x 1 x i32>,
<vscale x 1 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
@@ -1668,7 +1670,7 @@ entry:
<vscale x 1 x i32> undef,
<vscale x 1 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i32> %a
}
@@ -1678,10 +1680,10 @@ declare <vscale x 1 x i32> @llvm.riscv.vssubu.mask.nxv1i32.i32(
<vscale x 1 x i32>,
i32,
<vscale x 1 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 1 x i32> @intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
+define <vscale x 1 x i32> @intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32(<vscale x 1 x i32> %0, <vscale x 1 x i32> %1, i32 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i32_nxv1i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu
@@ -1693,7 +1695,7 @@ entry:
<vscale x 1 x i32> %1,
i32 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i32> %a
}
@@ -1702,9 +1704,9 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.nxv2i32.i32(
<vscale x 2 x i32>,
<vscale x 2 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
@@ -1715,7 +1717,7 @@ entry:
<vscale x 2 x i32> undef,
<vscale x 2 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i32> %a
}
@@ -1725,10 +1727,10 @@ declare <vscale x 2 x i32> @llvm.riscv.vssubu.mask.nxv2i32.i32(
<vscale x 2 x i32>,
i32,
<vscale x 2 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 2 x i32> @intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+define <vscale x 2 x i32> @intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1, i32 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i32_nxv2i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu
@@ -1740,7 +1742,7 @@ entry:
<vscale x 2 x i32> %1,
i32 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i32> %a
}
@@ -1749,9 +1751,9 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.nxv4i32.i32(
<vscale x 4 x i32>,
<vscale x 4 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
@@ -1762,7 +1764,7 @@ entry:
<vscale x 4 x i32> undef,
<vscale x 4 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i32> %a
}
@@ -1772,10 +1774,10 @@ declare <vscale x 4 x i32> @llvm.riscv.vssubu.mask.nxv4i32.i32(
<vscale x 4 x i32>,
i32,
<vscale x 4 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 4 x i32> @intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+define <vscale x 4 x i32> @intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1, i32 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i32_nxv4i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu
@@ -1787,7 +1789,7 @@ entry:
<vscale x 4 x i32> %1,
i32 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i32> %a
}
@@ -1796,9 +1798,9 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.nxv8i32.i32(
<vscale x 8 x i32>,
<vscale x 8 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
@@ -1809,7 +1811,7 @@ entry:
<vscale x 8 x i32> undef,
<vscale x 8 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i32> %a
}
@@ -1819,10 +1821,10 @@ declare <vscale x 8 x i32> @llvm.riscv.vssubu.mask.nxv8i32.i32(
<vscale x 8 x i32>,
i32,
<vscale x 8 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 8 x i32> @intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+define <vscale x 8 x i32> @intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1, i32 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i32_nxv8i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu
@@ -1834,7 +1836,7 @@ entry:
<vscale x 8 x i32> %1,
i32 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i32> %a
}
@@ -1843,9 +1845,9 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.nxv16i32.i32(
<vscale x 16 x i32>,
<vscale x 16 x i32>,
i32,
- i64);
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, i64 %2) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, i32 %1, iXLen %2) nounwind {
; CHECK-LABEL: intrinsic_vssubu_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
@@ -1856,7 +1858,7 @@ entry:
<vscale x 16 x i32> undef,
<vscale x 16 x i32> %0,
i32 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 16 x i32> %a
}
@@ -1866,10 +1868,10 @@ declare <vscale x 16 x i32> @llvm.riscv.vssubu.mask.nxv16i32.i32(
<vscale x 16 x i32>,
i32,
<vscale x 16 x i1>,
- i64,
- i64);
+ iXLen,
+ iXLen)
-define <vscale x 16 x i32> @intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, i64 %4) nounwind {
+define <vscale x 16 x i32> @intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32(<vscale x 16 x i32> %0, <vscale x 16 x i32> %1, i32 %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv16i32_nxv16i32_i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu
@@ -1881,7 +1883,7 @@ entry:
<vscale x 16 x i32> %1,
i32 %2,
<vscale x 16 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 16 x i32> %a
}
@@ -1890,20 +1892,32 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
<vscale x 1 x i64>,
<vscale x 1 x i64>,
i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vssubu.nxv1i64.i64(
<vscale x 1 x i64> undef,
<vscale x 1 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 1 x i64> %a
}
@@ -1913,22 +1927,34 @@ declare <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
<vscale x 1 x i64>,
i64,
<vscale x 1 x i1>,
- i64,
- i64);
-
-define <vscale x 1 x i64> @intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v9, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 1 x i64> @intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, i64 %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v9, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv1i64_nxv1i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu
+; RV64-NEXT: vssubu.vx v8, v9, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 1 x i64> @llvm.riscv.vssubu.mask.nxv1i64.i64(
<vscale x 1 x i64> %0,
<vscale x 1 x i64> %1,
i64 %2,
<vscale x 1 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 1 x i64> %a
}
@@ -1937,20 +1963,32 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
<vscale x 2 x i64>,
<vscale x 2 x i64>,
i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vssubu.nxv2i64.i64(
<vscale x 2 x i64> undef,
<vscale x 2 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 2 x i64> %a
}
@@ -1960,22 +1998,34 @@ declare <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
<vscale x 2 x i64>,
i64,
<vscale x 2 x i1>,
- i64,
- i64);
-
-define <vscale x 2 x i64> @intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v10, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 2 x i64> @intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1, i64 %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v10, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv2i64_nxv2i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu
+; RV64-NEXT: vssubu.vx v8, v10, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 2 x i64> @llvm.riscv.vssubu.mask.nxv2i64.i64(
<vscale x 2 x i64> %0,
<vscale x 2 x i64> %1,
i64 %2,
<vscale x 2 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 2 x i64> %a
}
@@ -1984,20 +2034,32 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
<vscale x 4 x i64>,
<vscale x 4 x i64>,
i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vssubu.nxv4i64.i64(
<vscale x 4 x i64> undef,
<vscale x 4 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 4 x i64> %a
}
@@ -2007,22 +2069,34 @@ declare <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
<vscale x 4 x i64>,
i64,
<vscale x 4 x i1>,
- i64,
- i64);
-
-define <vscale x 4 x i64> @intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v12, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 4 x i64> @intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1, i64 %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v12, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv4i64_nxv4i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu
+; RV64-NEXT: vssubu.vx v8, v12, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 4 x i64> @llvm.riscv.vssubu.mask.nxv4i64.i64(
<vscale x 4 x i64> %0,
<vscale x 4 x i64> %1,
i64 %2,
<vscale x 4 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 4 x i64> %a
}
@@ -2031,20 +2105,32 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
<vscale x 8 x i64>,
<vscale x 8 x i64>,
i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, i64 %2) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT: vssubu.vx v8, v8, a0
-; CHECK-NEXT: ret
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, i64 %1, iXLen %2) nounwind {
+; RV32-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vssubu.nxv8i64.i64(
<vscale x 8 x i64> undef,
<vscale x 8 x i64> %0,
i64 %1,
- i64 %2)
+ iXLen %2)
ret <vscale x 8 x i64> %a
}
@@ -2054,22 +2140,34 @@ declare <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
<vscale x 8 x i64>,
i64,
<vscale x 8 x i1>,
- i64,
- i64);
-
-define <vscale x 8 x i64> @intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
-; CHECK-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
-; CHECK-NEXT: vssubu.vx v8, v16, a0, v0.t
-; CHECK-NEXT: ret
+ iXLen,
+ iXLen)
+
+define <vscale x 8 x i64> @intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64(<vscale x 8 x i64> %0, <vscale x 8 x i64> %1, i64 %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; RV32-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu
+; RV32-NEXT: vlse64.v v24, (a0), zero
+; RV32-NEXT: vssubu.vv v8, v16, v24, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: intrinsic_vssubu_mask_vx_nxv8i64_nxv8i64_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, mu
+; RV64-NEXT: vssubu.vx v8, v16, a0, v0.t
+; RV64-NEXT: ret
entry:
%a = call <vscale x 8 x i64> @llvm.riscv.vssubu.mask.nxv8i64.i64(
<vscale x 8 x i64> %0,
<vscale x 8 x i64> %1,
i64 %2,
<vscale x 8 x i1> %3,
- i64 %4, i64 1)
+ iXLen %4, iXLen 1)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/ucmp.ll b/llvm/test/CodeGen/RISCV/ucmp.ll
index 026340e..c74bc68 100644
--- a/llvm/test/CodeGen/RISCV/ucmp.ll
+++ b/llvm/test/CodeGen/RISCV/ucmp.ll
@@ -48,10 +48,8 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: ucmp.8.32:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: srli a1, a1, 32
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
+; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: sltu a2, a0, a1
; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: sub a0, a0, a2
@@ -164,10 +162,44 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
;
; RV64I-LABEL: ucmp.32.32:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: srli a1, a1, 32
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
+; RV64I-NEXT: sext.w a1, a1
+; RV64I-NEXT: sext.w a0, a0
+; RV64I-NEXT: sltu a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
+; RV64I-NEXT: sub a0, a0, a2
+; RV64I-NEXT: ret
+ %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+ ret i32 %1
+}
+
+define i32 @ucmp.32.32_sext(i32 signext %x, i32 signext %y) nounwind {
+; RV32I-LABEL: ucmp.32.32_sext:
+; RV32I: # %bb.0:
+; RV32I-NEXT: sltu a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
+; RV32I-NEXT: sub a0, a0, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ucmp.32.32_sext:
+; RV64I: # %bb.0:
+; RV64I-NEXT: sltu a2, a0, a1
+; RV64I-NEXT: sltu a0, a1, a0
+; RV64I-NEXT: sub a0, a0, a2
+; RV64I-NEXT: ret
+ %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
+ ret i32 %1
+}
+
+define i32 @ucmp.32.32_zext(i32 zeroext %x, i32 zeroext %y) nounwind {
+; RV32I-LABEL: ucmp.32.32_zext:
+; RV32I: # %bb.0:
+; RV32I-NEXT: sltu a2, a0, a1
+; RV32I-NEXT: sltu a0, a1, a0
+; RV32I-NEXT: sub a0, a0, a2
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ucmp.32.32_zext:
+; RV64I: # %bb.0:
; RV64I-NEXT: sltu a2, a0, a1
; RV64I-NEXT: sltu a0, a1, a0
; RV64I-NEXT: sub a0, a0, a2
@@ -179,13 +211,13 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind {
define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
; RV32I-LABEL: ucmp.32.64:
; RV32I: # %bb.0:
-; RV32I-NEXT: beq a1, a3, .LBB6_2
+; RV32I-NEXT: beq a1, a3, .LBB8_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: sltu a4, a1, a3
; RV32I-NEXT: sltu a0, a3, a1
; RV32I-NEXT: sub a0, a0, a4
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB6_2:
+; RV32I-NEXT: .LBB8_2:
; RV32I-NEXT: sltu a4, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
; RV32I-NEXT: sub a0, a0, a4
@@ -204,15 +236,15 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind {
define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind {
; RV32I-LABEL: ucmp.64.64:
; RV32I: # %bb.0:
-; RV32I-NEXT: beq a1, a3, .LBB7_2
+; RV32I-NEXT: beq a1, a3, .LBB9_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: sltu a4, a1, a3
; RV32I-NEXT: sltu a0, a3, a1
-; RV32I-NEXT: j .LBB7_3
-; RV32I-NEXT: .LBB7_2:
+; RV32I-NEXT: j .LBB9_3
+; RV32I-NEXT: .LBB9_2:
; RV32I-NEXT: sltu a4, a0, a2
; RV32I-NEXT: sltu a0, a2, a0
-; RV32I-NEXT: .LBB7_3:
+; RV32I-NEXT: .LBB9_3:
; RV32I-NEXT: sub a0, a0, a4
; RV32I-NEXT: srai a1, a0, 31
; RV32I-NEXT: ret
diff --git a/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll b/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll
index c7bf71b..62cf06d 100644
--- a/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll
+++ b/llvm/test/CodeGen/SPARC/2011-01-11-CC.ll
@@ -1,128 +1,459 @@
-; RUN: llc -march=sparc <%s | FileCheck %s -check-prefix=V8
-; RUN: llc -march=sparc -mattr=v9 <%s | FileCheck %s -check-prefix=V9
-; RUN: llc -mtriple=sparc64-unknown-linux <%s | FileCheck %s -check-prefix=SPARC64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=sparc %s -o - | FileCheck %s -check-prefix=V8
+; RUN: llc -march=sparc -mattr=v9 %s -o - | FileCheck %s -check-prefix=V9
+; RUN: llc -mtriple=sparc64-unknown-linux %s -o - | FileCheck %s -check-prefix=SPARC64
-
-define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind readnone noinline {
+define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind {
+; V8-LABEL: test_addx:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: addcc %o1, %o3, %o3
+; V8-NEXT: addxcc %o0, %o2, %o1
+; V8-NEXT: mov 1, %o0
+; V8-NEXT: cmp %o1, %o4
+; V8-NEXT: bleu .LBB0_4
+; V8-NEXT: mov %o0, %o2
+; V8-NEXT: ! %bb.1: ! %entry
+; V8-NEXT: cmp %o3, %o5
+; V8-NEXT: bleu .LBB0_5
+; V8-NEXT: nop
+; V8-NEXT: .LBB0_2: ! %entry
+; V8-NEXT: cmp %o1, %o4
+; V8-NEXT: bne .LBB0_6
+; V8-NEXT: nop
+; V8-NEXT: .LBB0_3: ! %entry
+; V8-NEXT: retl
+; V8-NEXT: nop
+; V8-NEXT: .LBB0_4: ! %entry
+; V8-NEXT: mov %g0, %o2
+; V8-NEXT: cmp %o3, %o5
+; V8-NEXT: bgu .LBB0_2
+; V8-NEXT: nop
+; V8-NEXT: .LBB0_5: ! %entry
+; V8-NEXT: mov %g0, %o0
+; V8-NEXT: cmp %o1, %o4
+; V8-NEXT: be .LBB0_3
+; V8-NEXT: nop
+; V8-NEXT: .LBB0_6: ! %entry
+; V8-NEXT: retl
+; V8-NEXT: mov %o2, %o0
+;
+; V9-LABEL: test_addx:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: mov %g0, %g2
+; V9-NEXT: mov %g0, %g3
+; V9-NEXT: addcc %o1, %o3, %o1
+; V9-NEXT: addxcc %o0, %o2, %o0
+; V9-NEXT: cmp %o0, %o4
+; V9-NEXT: movgu %icc, 1, %g2
+; V9-NEXT: cmp %o1, %o5
+; V9-NEXT: movgu %icc, 1, %g3
+; V9-NEXT: cmp %o0, %o4
+; V9-NEXT: move %icc, %g3, %g2
+; V9-NEXT: retl
+; V9-NEXT: mov %g2, %o0
+;
+; SPARC64-LABEL: test_addx:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: mov %g0, %o3
+; SPARC64-NEXT: add %o0, %o1, %o0
+; SPARC64-NEXT: cmp %o0, %o2
+; SPARC64-NEXT: movgu %xcc, 1, %o3
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: srl %o3, 0, %o0
entry:
-; V8: addcc
-; V8-NOT: subcc
-; V8: addx
-; V9: addcc
-; V9-NOT: subcc
-; V9: addx
-; V9: mov{{e|ne}} %icc
%0 = add i64 %a, %b
%1 = icmp ugt i64 %0, %c
%2 = zext i1 %1 to i32
ret i32 %2
}
-
-define i32 @test_select_int_icc(i32 %a, i32 %b, i32 %c) nounwind readnone noinline {
+define i32 @test_select_int_icc(i32 %a, i32 %b, i32 %c) nounwind {
+; V8-LABEL: test_select_int_icc:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: cmp %o0, 0
+; V8-NEXT: be .LBB1_2
+; V8-NEXT: mov %o1, %o0
+; V8-NEXT: ! %bb.1: ! %entry
+; V8-NEXT: mov %o2, %o0
+; V8-NEXT: .LBB1_2: ! %entry
+; V8-NEXT: retl
+; V8-NEXT: nop
+;
+; V9-LABEL: test_select_int_icc:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: cmp %o0, 0
+; V9-NEXT: move %icc, %o1, %o2
+; V9-NEXT: retl
+; V9-NEXT: mov %o2, %o0
+;
+; SPARC64-LABEL: test_select_int_icc:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: cmp %o0, 0
+; SPARC64-NEXT: move %icc, %o1, %o2
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: mov %o2, %o0
entry:
-; V8: test_select_int_icc
-; V8: cmp
-; V8: {{be|bne}}
-; V9: test_select_int_icc
-; V9: cmp
-; V9-NOT: {{be|bne}}
-; V9: mov{{e|ne}} %icc
%0 = icmp eq i32 %a, 0
%1 = select i1 %0, i32 %b, i32 %c
ret i32 %1
}
-
-define float @test_select_fp_icc(i32 %a, float %f1, float %f2) nounwind readnone noinline {
+define float @test_select_fp_icc(i32 %a, float %f1, float %f2) nounwind {
+; V8-LABEL: test_select_fp_icc:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: add %sp, -104, %sp
+; V8-NEXT: st %o2, [%sp+100]
+; V8-NEXT: cmp %o0, 0
+; V8-NEXT: be .LBB2_2
+; V8-NEXT: st %o1, [%sp+96]
+; V8-NEXT: ! %bb.1: ! %entry
+; V8-NEXT: ld [%sp+100], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 104, %sp
+; V8-NEXT: .LBB2_2:
+; V8-NEXT: ld [%sp+96], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 104, %sp
+;
+; V9-LABEL: test_select_fp_icc:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: add %sp, -104, %sp
+; V9-NEXT: st %o2, [%sp+100]
+; V9-NEXT: st %o1, [%sp+96]
+; V9-NEXT: ld [%sp+100], %f0
+; V9-NEXT: ld [%sp+96], %f1
+; V9-NEXT: cmp %o0, 0
+; V9-NEXT: fmovse %icc, %f1, %f0
+; V9-NEXT: retl
+; V9-NEXT: add %sp, 104, %sp
+;
+; SPARC64-LABEL: test_select_fp_icc:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: fmovs %f5, %f0
+; SPARC64-NEXT: cmp %o0, 0
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: fmovse %icc, %f3, %f0
entry:
-; V8: test_select_fp_icc
-; V8: cmp
-; V8: {{be|bne}}
-; V9: test_select_fp_icc
-; V9: cmp
-; V9-NOT: {{be|bne}}
-; V9: fmovs{{e|ne}} %icc
%0 = icmp eq i32 %a, 0
%1 = select i1 %0, float %f1, float %f2
ret float %1
}
-define double @test_select_dfp_icc(i32 %a, double %f1, double %f2) nounwind readnone noinline {
+define double @test_select_dfp_icc(i32 %a, double %f1, double %f2) nounwind {
+; V8-LABEL: test_select_dfp_icc:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: add %sp, -112, %sp
+; V8-NEXT: mov %o4, %o5
+; V8-NEXT: mov %o2, %g3
+; V8-NEXT: mov %o3, %o4
+; V8-NEXT: std %o4, [%sp+96]
+; V8-NEXT: cmp %o0, 0
+; V8-NEXT: mov %o1, %g2
+; V8-NEXT: be .LBB3_2
+; V8-NEXT: std %g2, [%sp+104]
+; V8-NEXT: ! %bb.1: ! %entry
+; V8-NEXT: ldd [%sp+96], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 112, %sp
+; V8-NEXT: .LBB3_2:
+; V8-NEXT: ldd [%sp+104], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 112, %sp
+;
+; V9-LABEL: test_select_dfp_icc:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: add %sp, -112, %sp
+; V9-NEXT: mov %o4, %o5
+; V9-NEXT: mov %o2, %g3
+; V9-NEXT: mov %o3, %o4
+; V9-NEXT: std %o4, [%sp+96]
+; V9-NEXT: mov %o1, %g2
+; V9-NEXT: std %g2, [%sp+104]
+; V9-NEXT: ldd [%sp+96], %f0
+; V9-NEXT: ldd [%sp+104], %f2
+; V9-NEXT: cmp %o0, 0
+; V9-NEXT: fmovde %icc, %f2, %f0
+; V9-NEXT: retl
+; V9-NEXT: add %sp, 112, %sp
+;
+; SPARC64-LABEL: test_select_dfp_icc:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: fmovd %f4, %f0
+; SPARC64-NEXT: cmp %o0, 0
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: fmovde %icc, %f2, %f0
entry:
-; V8: test_select_dfp_icc
-; V8: cmp
-; V8: {{be|bne}}
-; V9: test_select_dfp_icc
-; V9: cmp
-; V9-NOT: {{be|bne}}
-; V9: fmovd{{e|ne}} %icc
%0 = icmp eq i32 %a, 0
%1 = select i1 %0, double %f1, double %f2
ret double %1
}
-define i32 @test_select_int_fcc(float %f, i32 %a, i32 %b) nounwind readnone noinline {
+define i32 @test_select_int_fcc(float %f, i32 %a, i32 %b) nounwind {
+; V8-LABEL: test_select_int_fcc:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: add %sp, -96, %sp
+; V8-NEXT: st %o0, [%sp+92]
+; V8-NEXT: ld [%sp+92], %f0
+; V8-NEXT: sethi %hi(.LCPI4_0), %o0
+; V8-NEXT: ld [%o0+%lo(.LCPI4_0)], %f1
+; V8-NEXT: fcmps %f0, %f1
+; V8-NEXT: nop
+; V8-NEXT: fbne .LBB4_2
+; V8-NEXT: mov %o1, %o0
+; V8-NEXT: ! %bb.1: ! %entry
+; V8-NEXT: mov %o2, %o0
+; V8-NEXT: .LBB4_2: ! %entry
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 96, %sp
+;
+; V9-LABEL: test_select_int_fcc:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: add %sp, -96, %sp
+; V9-NEXT: st %o0, [%sp+92]
+; V9-NEXT: ld [%sp+92], %f0
+; V9-NEXT: sethi %hi(.LCPI4_0), %o0
+; V9-NEXT: ld [%o0+%lo(.LCPI4_0)], %f1
+; V9-NEXT: mov %o2, %o0
+; V9-NEXT: fcmps %fcc0, %f0, %f1
+; V9-NEXT: movne %fcc0, %o1, %o0
+; V9-NEXT: retl
+; V9-NEXT: add %sp, 96, %sp
+;
+; SPARC64-LABEL: test_select_int_fcc:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: sethi %h44(.LCPI4_0), %o0
+; SPARC64-NEXT: add %o0, %m44(.LCPI4_0), %o0
+; SPARC64-NEXT: sllx %o0, 12, %o0
+; SPARC64-NEXT: ld [%o0+%l44(.LCPI4_0)], %f0
+; SPARC64-NEXT: mov %o2, %o0
+; SPARC64-NEXT: fcmps %fcc0, %f1, %f0
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: movne %fcc0, %o1, %o0
entry:
-;V8-LABEL: test_select_int_fcc:
-;V8: fcmps
-;V8-NEXT: nop
-;V8: {{fbe|fbne}}
-;V9-LABEL: test_select_int_fcc:
-;V9: fcmps
-;V9-NOT: nop
-;V9-NOT: {{fbe|fbne}}
-;V9: mov{{e|ne}} %fcc0
%0 = fcmp une float %f, 0.000000e+00
%a.b = select i1 %0, i32 %a, i32 %b
ret i32 %a.b
}
-
-define float @test_select_fp_fcc(float %f, float %f1, float %f2) nounwind readnone noinline {
+define float @test_select_fp_fcc(float %f, float %f1, float %f2) nounwind {
+; V8-LABEL: test_select_fp_fcc:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: add %sp, -104, %sp
+; V8-NEXT: st %o0, [%sp+92]
+; V8-NEXT: st %o2, [%sp+100]
+; V8-NEXT: st %o1, [%sp+96]
+; V8-NEXT: ld [%sp+92], %f0
+; V8-NEXT: sethi %hi(.LCPI5_0), %o0
+; V8-NEXT: ld [%o0+%lo(.LCPI5_0)], %f1
+; V8-NEXT: fcmps %f0, %f1
+; V8-NEXT: nop
+; V8-NEXT: fbne .LBB5_2
+; V8-NEXT: nop
+; V8-NEXT: ! %bb.1: ! %entry
+; V8-NEXT: ld [%sp+100], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 104, %sp
+; V8-NEXT: .LBB5_2:
+; V8-NEXT: ld [%sp+96], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 104, %sp
+;
+; V9-LABEL: test_select_fp_fcc:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: add %sp, -104, %sp
+; V9-NEXT: st %o0, [%sp+92]
+; V9-NEXT: st %o2, [%sp+100]
+; V9-NEXT: st %o1, [%sp+96]
+; V9-NEXT: ld [%sp+92], %f1
+; V9-NEXT: ld [%sp+100], %f0
+; V9-NEXT: sethi %hi(.LCPI5_0), %o0
+; V9-NEXT: ld [%o0+%lo(.LCPI5_0)], %f2
+; V9-NEXT: ld [%sp+96], %f3
+; V9-NEXT: fcmps %fcc0, %f1, %f2
+; V9-NEXT: fmovsne %fcc0, %f3, %f0
+; V9-NEXT: retl
+; V9-NEXT: add %sp, 104, %sp
+;
+; SPARC64-LABEL: test_select_fp_fcc:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: sethi %h44(.LCPI5_0), %o0
+; SPARC64-NEXT: add %o0, %m44(.LCPI5_0), %o0
+; SPARC64-NEXT: sllx %o0, 12, %o0
+; SPARC64-NEXT: ld [%o0+%l44(.LCPI5_0)], %f2
+; SPARC64-NEXT: fmovs %f5, %f0
+; SPARC64-NEXT: fcmps %fcc0, %f1, %f2
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: fmovsne %fcc0, %f3, %f0
entry:
-;V8-LABEL: test_select_fp_fcc:
-;V8: fcmps
-;V8: {{fbe|fbne}}
-;V9-LABEL: test_select_fp_fcc:
-;V9: fcmps
-;V9-NOT: {{fbe|fbne}}
-;V9: fmovs{{e|ne}} %fcc0
%0 = fcmp une float %f, 0.000000e+00
%1 = select i1 %0, float %f1, float %f2
ret float %1
}
-define double @test_select_dfp_fcc(double %f, double %f1, double %f2) nounwind readnone noinline {
+define double @test_select_dfp_fcc(double %f, double %f1, double %f2) nounwind {
+; V8-LABEL: test_select_dfp_fcc:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: add %sp, -120, %sp
+; V8-NEXT: ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V8-NEXT: ! kill: def $o5 killed $o5 killed $o4_o5 def $o4_o5
+; V8-NEXT: ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V8-NEXT: ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V8-NEXT: std %o0, [%sp+112]
+; V8-NEXT: ! kill: def $o4 killed $o4 killed $o4_o5 def $o4_o5
+; V8-NEXT: std %o4, [%sp+96]
+; V8-NEXT: ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V8-NEXT: std %o2, [%sp+104]
+; V8-NEXT: ldd [%sp+112], %f0
+; V8-NEXT: sethi %hi(.LCPI6_0), %o0
+; V8-NEXT: ldd [%o0+%lo(.LCPI6_0)], %f2
+; V8-NEXT: fcmpd %f0, %f2
+; V8-NEXT: nop
+; V8-NEXT: fbne .LBB6_2
+; V8-NEXT: nop
+; V8-NEXT: ! %bb.1: ! %entry
+; V8-NEXT: ldd [%sp+96], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 120, %sp
+; V8-NEXT: .LBB6_2:
+; V8-NEXT: ldd [%sp+104], %f0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 120, %sp
+;
+; V9-LABEL: test_select_dfp_fcc:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: add %sp, -120, %sp
+; V9-NEXT: ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V9-NEXT: ! kill: def $o5 killed $o5 killed $o4_o5 def $o4_o5
+; V9-NEXT: ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V9-NEXT: ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V9-NEXT: std %o0, [%sp+112]
+; V9-NEXT: ! kill: def $o4 killed $o4 killed $o4_o5 def $o4_o5
+; V9-NEXT: std %o4, [%sp+96]
+; V9-NEXT: ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V9-NEXT: std %o2, [%sp+104]
+; V9-NEXT: ldd [%sp+112], %f2
+; V9-NEXT: ldd [%sp+96], %f0
+; V9-NEXT: sethi %hi(.LCPI6_0), %o0
+; V9-NEXT: ldd [%o0+%lo(.LCPI6_0)], %f4
+; V9-NEXT: ldd [%sp+104], %f6
+; V9-NEXT: fcmpd %fcc0, %f2, %f4
+; V9-NEXT: fmovdne %fcc0, %f6, %f0
+; V9-NEXT: retl
+; V9-NEXT: add %sp, 120, %sp
+;
+; SPARC64-LABEL: test_select_dfp_fcc:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: sethi %h44(.LCPI6_0), %o0
+; SPARC64-NEXT: add %o0, %m44(.LCPI6_0), %o0
+; SPARC64-NEXT: sllx %o0, 12, %o0
+; SPARC64-NEXT: ldd [%o0+%l44(.LCPI6_0)], %f6
+; SPARC64-NEXT: fcmpd %fcc0, %f0, %f6
+; SPARC64-NEXT: fmovdne %fcc0, %f2, %f4
+; SPARC64-NEXT: fmovd %f4, %f0
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: nop
entry:
-;V8-LABEL: test_select_dfp_fcc:
-;V8: fcmpd
-;V8-NEXT: nop
-;V8: {{fbne|fbe}}
-;V9-LABEL: test_select_dfp_fcc:
-;V9: fcmpd
-;V9-NOT: nop
-;V9-NOT: {{fbne|fbe}}
-;V9: fmovd{{e|ne}} %fcc0
%0 = fcmp une double %f, 0.000000e+00
%1 = select i1 %0, double %f1, double %f2
ret double %1
}
-define i32 @test_float_cc(double %a, double %b, i32 %c, i32 %d) {
+define i32 @test_float_cc(double %a, double %b, i32 %c, i32 %d) nounwind {
+; V8-LABEL: test_float_cc:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: add %sp, -112, %sp
+; V8-NEXT: ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V8-NEXT: ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V8-NEXT: ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V8-NEXT: std %o2, [%sp+96]
+; V8-NEXT: ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V8-NEXT: std %o0, [%sp+104]
+; V8-NEXT: ldd [%sp+104], %f2
+; V8-NEXT: sethi %hi(.LCPI7_0), %o0
+; V8-NEXT: ldd [%o0+%lo(.LCPI7_0)], %f0
+; V8-NEXT: fcmpd %f2, %f0
+; V8-NEXT: nop
+; V8-NEXT: fbuge .LBB7_3
+; V8-NEXT: nop
+; V8-NEXT: ! %bb.1: ! %loop.2
+; V8-NEXT: ldd [%sp+96], %f2
+; V8-NEXT: fcmpd %f2, %f0
+; V8-NEXT: nop
+; V8-NEXT: fbule .LBB7_3
+; V8-NEXT: nop
+; V8-NEXT: ! %bb.2: ! %exit.1
+; V8-NEXT: mov 1, %o0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 112, %sp
+; V8-NEXT: .LBB7_3: ! %loop
+; V8-NEXT: ! =>This Inner Loop Header: Depth=1
+; V8-NEXT: cmp %o4, 10
+; V8-NEXT: be .LBB7_3
+; V8-NEXT: nop
+; V8-NEXT: ! %bb.4: ! %exit.0
+; V8-NEXT: mov %g0, %o0
+; V8-NEXT: retl
+; V8-NEXT: add %sp, 112, %sp
+;
+; V9-LABEL: test_float_cc:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: add %sp, -112, %sp
+; V9-NEXT: ! kill: def $o3 killed $o3 killed $o2_o3 def $o2_o3
+; V9-NEXT: ! kill: def $o1 killed $o1 killed $o0_o1 def $o0_o1
+; V9-NEXT: ! kill: def $o2 killed $o2 killed $o2_o3 def $o2_o3
+; V9-NEXT: std %o2, [%sp+96]
+; V9-NEXT: ! kill: def $o0 killed $o0 killed $o0_o1 def $o0_o1
+; V9-NEXT: std %o0, [%sp+104]
+; V9-NEXT: ldd [%sp+104], %f2
+; V9-NEXT: sethi %hi(.LCPI7_0), %o0
+; V9-NEXT: ldd [%o0+%lo(.LCPI7_0)], %f0
+; V9-NEXT: fcmpd %fcc0, %f2, %f0
+; V9-NEXT: fbuge %fcc0, .LBB7_3
+; V9-NEXT: nop
+; V9-NEXT: ! %bb.1: ! %loop.2
+; V9-NEXT: ldd [%sp+96], %f2
+; V9-NEXT: fcmpd %fcc0, %f2, %f0
+; V9-NEXT: fbule %fcc0, .LBB7_3
+; V9-NEXT: nop
+; V9-NEXT: ! %bb.2: ! %exit.1
+; V9-NEXT: mov 1, %o0
+; V9-NEXT: retl
+; V9-NEXT: add %sp, 112, %sp
+; V9-NEXT: .LBB7_3: ! %loop
+; V9-NEXT: ! =>This Inner Loop Header: Depth=1
+; V9-NEXT: cmp %o4, 10
+; V9-NEXT: be %icc, .LBB7_3
+; V9-NEXT: nop
+; V9-NEXT: ! %bb.4: ! %exit.0
+; V9-NEXT: mov %g0, %o0
+; V9-NEXT: retl
+; V9-NEXT: add %sp, 112, %sp
+;
+; SPARC64-LABEL: test_float_cc:
+; SPARC64: ! %bb.0: ! %entry
+; SPARC64-NEXT: sethi %h44(.LCPI7_0), %o0
+; SPARC64-NEXT: add %o0, %m44(.LCPI7_0), %o0
+; SPARC64-NEXT: sllx %o0, 12, %o0
+; SPARC64-NEXT: ldd [%o0+%l44(.LCPI7_0)], %f4
+; SPARC64-NEXT: fcmpd %fcc0, %f0, %f4
+; SPARC64-NEXT: fbuge %fcc0, .LBB7_3
+; SPARC64-NEXT: nop
+; SPARC64-NEXT: ! %bb.1: ! %loop.2
+; SPARC64-NEXT: fcmpd %fcc0, %f2, %f4
+; SPARC64-NEXT: fbule %fcc0, .LBB7_3
+; SPARC64-NEXT: nop
+; SPARC64-NEXT: ! %bb.2: ! %exit.1
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: mov 1, %o0
+; SPARC64-NEXT: .LBB7_3: ! %loop
+; SPARC64-NEXT: ! =>This Inner Loop Header: Depth=1
+; SPARC64-NEXT: cmp %o2, 10
+; SPARC64-NEXT: be %icc, .LBB7_3
+; SPARC64-NEXT: nop
+; SPARC64-NEXT: ! %bb.4: ! %exit.0
+; SPARC64-NEXT: retl
+; SPARC64-NEXT: mov %g0, %o0
entry:
-; V8-LABEL: test_float_cc
-; V8: fcmpd
-; V8: {{fbl|fbuge}} .LBB
-; V8: fcmpd
-; V8: {{fbule|fbg}} .LBB
-
-; V9-LABEL: test_float_cc
-; V9: fcmpd %fcc0
-; V9: {{fbl|fbuge}} %fcc0, .LBB
-; V9: fcmpd %fcc0
-; V9: {{fbule|fbg}} %fcc0, .LBB
-
%0 = fcmp uge double %a, 0.000000e+00
br i1 %0, label %loop, label %loop.2
@@ -141,39 +472,92 @@ exit.1:
ret i32 1
}
-; V8-LABEL: test_adde_sube
-; V8: addcc
-; V8: addxcc
-; V8: addxcc
-; V8: addxcc
-; V8: subcc
-; V8: subxcc
-; V8: subxcc
-; V8: subxcc
-
-
-; V9-LABEL: test_adde_sube
-; V9: addcc
-; V9: addxcc
-; V9: addxcc
-; V9: addxcc
-; V9: subcc
-; V9: subxcc
-; V9: subxcc
-; V9: subxcc
-
-; SPARC64-LABEL: test_adde_sube
-; SPARC64: addcc
-; SPARC64: addxcc
-; SPARC64: addxcc
-; SPARC64: addxcc
-; SPARC64: subcc
-; SPARC64: subxcc
-; SPARC64: subxcc
-; SPARC64: subxcc
-
-
-define void @test_adde_sube(ptr %a, ptr %b, ptr %sum, ptr %diff) {
+define void @test_adde_sube(ptr %a, ptr %b, ptr %sum, ptr %diff) nounwind {
+; V8-LABEL: test_adde_sube:
+; V8: ! %bb.0: ! %entry
+; V8-NEXT: save %sp, -96, %sp
+; V8-NEXT: ldd [%i0+8], %i4
+; V8-NEXT: ldd [%i1+8], %l0
+; V8-NEXT: ldd [%i0], %g2
+; V8-NEXT: ldd [%i1], %l2
+; V8-NEXT: addcc %i5, %l1, %l5
+; V8-NEXT: addxcc %i4, %l0, %l4
+; V8-NEXT: addxcc %g3, %l3, %l1
+; V8-NEXT: addxcc %g2, %l2, %l0
+; V8-NEXT: std %l4, [%i2+8]
+; V8-NEXT: std %l0, [%i2]
+; V8-NEXT: !APP
+; V8-NEXT: !NO_APP
+; V8-NEXT: ldd [%i0+8], %l0
+; V8-NEXT: ldd [%i0], %i0
+; V8-NEXT: subcc %i5, %l1, %l3
+; V8-NEXT: subxcc %i4, %l0, %l2
+; V8-NEXT: subxcc %g3, %i1, %i5
+; V8-NEXT: subxcc %g2, %i0, %i4
+; V8-NEXT: std %l2, [%i3+8]
+; V8-NEXT: std %i4, [%i3]
+; V8-NEXT: ret
+; V8-NEXT: restore
+;
+; V9-LABEL: test_adde_sube:
+; V9: ! %bb.0: ! %entry
+; V9-NEXT: save %sp, -96, %sp
+; V9-NEXT: ldd [%i0+8], %i4
+; V9-NEXT: ldd [%i1+8], %l0
+; V9-NEXT: ldd [%i0], %g2
+; V9-NEXT: ldd [%i1], %l2
+; V9-NEXT: addcc %i5, %l1, %l5
+; V9-NEXT: addxcc %i4, %l0, %l4
+; V9-NEXT: addxcc %g3, %l3, %l1
+; V9-NEXT: addxcc %g2, %l2, %l0
+; V9-NEXT: std %l4, [%i2+8]
+; V9-NEXT: std %l0, [%i2]
+; V9-NEXT: !APP
+; V9-NEXT: !NO_APP
+; V9-NEXT: ldd [%i0+8], %l0
+; V9-NEXT: ldd [%i0], %i0
+; V9-NEXT: subcc %i5, %l1, %l3
+; V9-NEXT: subxcc %i4, %l0, %l2
+; V9-NEXT: subxcc %g3, %i1, %i5
+; V9-NEXT: subxcc %g2, %i0, %i4
+; V9-NEXT: std %l2, [%i3+8]
+; V9-NEXT: std %i4, [%i3]
+; V9-NEXT: ret
+; V9-NEXT: restore
+;
+; SPARC64-LABEL: test_adde_sube:
+; SPARC64: .register %g2, #scratch
+; SPARC64-NEXT: .register %g3, #scratch
+; SPARC64-NEXT: ! %bb.0: ! %entry
+; SPARC64-NEXT: save %sp, -128, %sp
+; SPARC64-NEXT: ldx [%i0+8], %i4
+; SPARC64-NEXT: ldx [%i0], %i5
+; SPARC64-NEXT: ldx [%i1], %g2
+; SPARC64-NEXT: ldx [%i1+8], %i1
+; SPARC64-NEXT: mov %g0, %g3
+; SPARC64-NEXT: add %i5, %g2, %g2
+; SPARC64-NEXT: add %i4, %i1, %i1
+; SPARC64-NEXT: cmp %i1, %i4
+; SPARC64-NEXT: movcs %xcc, 1, %g3
+; SPARC64-NEXT: srl %g3, 0, %g3
+; SPARC64-NEXT: add %g2, %g3, %g2
+; SPARC64-NEXT: stx %i1, [%i2+8]
+; SPARC64-NEXT: stx %g2, [%i2]
+; SPARC64-NEXT: !APP
+; SPARC64-NEXT: !NO_APP
+; SPARC64-NEXT: ldx [%i0+8], %i1
+; SPARC64-NEXT: mov %g0, %i2
+; SPARC64-NEXT: ldx [%i0], %i0
+; SPARC64-NEXT: cmp %i4, %i1
+; SPARC64-NEXT: movcs %xcc, 1, %i2
+; SPARC64-NEXT: srl %i2, 0, %i2
+; SPARC64-NEXT: sub %i5, %i0, %i0
+; SPARC64-NEXT: sub %i0, %i2, %i0
+; SPARC64-NEXT: sub %i4, %i1, %i1
+; SPARC64-NEXT: stx %i1, [%i3+8]
+; SPARC64-NEXT: stx %i0, [%i3]
+; SPARC64-NEXT: ret
+; SPARC64-NEXT: restore
entry:
%0 = bitcast ptr %a to ptr
%1 = bitcast ptr %b to ptr
diff --git a/llvm/test/CodeGen/SPARC/64cond.ll b/llvm/test/CodeGen/SPARC/64cond.ll
index 10d0700..5a90022 100644
--- a/llvm/test/CodeGen/SPARC/64cond.ll
+++ b/llvm/test/CodeGen/SPARC/64cond.ll
@@ -1,10 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=sparc64-pc-openbsd -disable-sparc-leaf-proc | FileCheck %s
; Testing 64-bit conditionals. The sparc64 triple is an alias for sparcv9.
-; CHECK: cmpri
-; CHECK: cmp %i1, 1
-; CHECK: be %xcc,
-define void @cmpri(ptr %p, i64 %x) {
+define void @cmpri(ptr %p, i64 %x) nounwind {
+; CHECK-LABEL: cmpri:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: cmp %i1, 1
+; CHECK-NEXT: be %xcc, .LBB0_2
+; CHECK-NEXT: nop
+; CHECK-NEXT: ! %bb.1: ! %if.then
+; CHECK-NEXT: stx %i1, [%i0]
+; CHECK-NEXT: .LBB0_2: ! %if.end
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore
entry:
%tobool = icmp eq i64 %x, 1
br i1 %tobool, label %if.end, label %if.then
@@ -17,10 +26,18 @@ if.end:
ret void
}
-; CHECK: cmprr
-; CHECK: cmp %i1, %i2
-; CHECK: bgu %xcc,
-define void @cmprr(ptr %p, i64 %x, i64 %y) {
+define void @cmprr(ptr %p, i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: cmprr:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: cmp %i1, %i2
+; CHECK-NEXT: bgu %xcc, .LBB1_2
+; CHECK-NEXT: nop
+; CHECK-NEXT: ! %bb.1: ! %if.then
+; CHECK-NEXT: stx %i1, [%i0]
+; CHECK-NEXT: .LBB1_2: ! %if.end
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore
entry:
%tobool = icmp ugt i64 %x, %y
br i1 %tobool, label %if.end, label %if.then
@@ -33,67 +50,87 @@ if.end:
ret void
}
-; CHECK: selecti32_xcc
-; CHECK: cmp %i0, %i1
-; CHECK: movg %xcc, %i2, %i3
-; CHECK: restore %g0, %i3, %o0
-define i32 @selecti32_xcc(i64 %x, i64 %y, i32 %a, i32 %b) {
+define i32 @selecti32_xcc(i64 %x, i64 %y, i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: selecti32_xcc:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: cmp %i0, %i1
+; CHECK-NEXT: movg %xcc, %i2, %i3
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore %g0, %i3, %o0
entry:
%tobool = icmp sgt i64 %x, %y
%rv = select i1 %tobool, i32 %a, i32 %b
ret i32 %rv
}
-; CHECK: selecti64_xcc
-; CHECK: cmp %i0, %i1
-; CHECK: movg %xcc, %i2, %i3
-; CHECK: restore %g0, %i3, %o0
-define i64 @selecti64_xcc(i64 %x, i64 %y, i64 %a, i64 %b) {
+define i64 @selecti64_xcc(i64 %x, i64 %y, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: selecti64_xcc:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: cmp %i0, %i1
+; CHECK-NEXT: movg %xcc, %i2, %i3
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore %g0, %i3, %o0
entry:
%tobool = icmp sgt i64 %x, %y
%rv = select i1 %tobool, i64 %a, i64 %b
ret i64 %rv
}
-; CHECK: selecti64_icc
-; CHECK: cmp %i0, %i1
-; CHECK: movg %icc, %i2, %i3
-; CHECK: restore %g0, %i3, %o0
-define i64 @selecti64_icc(i32 %x, i32 %y, i64 %a, i64 %b) {
+define i64 @selecti64_icc(i32 %x, i32 %y, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: selecti64_icc:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: cmp %i0, %i1
+; CHECK-NEXT: movg %icc, %i2, %i3
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore %g0, %i3, %o0
entry:
%tobool = icmp sgt i32 %x, %y
%rv = select i1 %tobool, i64 %a, i64 %b
ret i64 %rv
}
-; CHECK: selecti64_fcc
-; CHECK: mov %i3, %i0
-; CHECK: fcmps %fcc0, %f1, %f3
-; CHECK: movul %fcc0, %i2, %i0
-; CHECK: restore
-define i64 @selecti64_fcc(float %x, float %y, i64 %a, i64 %b) {
+define i64 @selecti64_fcc(float %x, float %y, i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: selecti64_fcc:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: mov %i3, %i0
+; CHECK-NEXT: fcmps %fcc0, %f1, %f3
+; CHECK-NEXT: movul %fcc0, %i2, %i0
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore
entry:
%tobool = fcmp ult float %x, %y
%rv = select i1 %tobool, i64 %a, i64 %b
ret i64 %rv
}
-; CHECK: selectf32_xcc
-; CHECK: fmovs %f7, %f0
-; CHECK: cmp %i0, %i1
-; CHECK: fmovsg %xcc, %f5, %f0
-define float @selectf32_xcc(i64 %x, i64 %y, float %a, float %b) {
+define float @selectf32_xcc(i64 %x, i64 %y, float %a, float %b) nounwind {
+; CHECK-LABEL: selectf32_xcc:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: fmovs %f7, %f0
+; CHECK-NEXT: cmp %i0, %i1
+; CHECK-NEXT: fmovsg %xcc, %f5, %f0
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore
entry:
%tobool = icmp sgt i64 %x, %y
%rv = select i1 %tobool, float %a, float %b
ret float %rv
}
-; CHECK: selectf64_xcc
-; CHECK: fmovd %f6, %f0
-; CHECK: cmp %i0, %i1
-; CHECK: fmovdg %xcc, %f4, %f0
-define double @selectf64_xcc(i64 %x, i64 %y, double %a, double %b) {
+define double @selectf64_xcc(i64 %x, i64 %y, double %a, double %b) nounwind {
+; CHECK-LABEL: selectf64_xcc:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: fmovd %f6, %f0
+; CHECK-NEXT: cmp %i0, %i1
+; CHECK-NEXT: fmovdg %xcc, %f4, %f0
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore
entry:
%tobool = icmp sgt i64 %x, %y
%rv = select i1 %tobool, double %a, double %b
@@ -101,26 +138,38 @@ entry:
}
; The MOVXCC instruction can't use %g0 for its tied operand.
-; CHECK: select_consti64_xcc
-; CHECK: cmp
-; CHECK: movg %xcc, 123, %i{{[0-2]}}
-define i64 @select_consti64_xcc(i64 %x, i64 %y) {
+define i64 @select_consti64_xcc(i64 %x, i64 %y) nounwind {
+; CHECK-LABEL: select_consti64_xcc:
+; CHECK: ! %bb.0: ! %entry
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: mov %g0, %i2
+; CHECK-NEXT: cmp %i0, %i1
+; CHECK-NEXT: movg %xcc, 123, %i2
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore %g0, %i2, %o0
entry:
%tobool = icmp sgt i64 %x, %y
%rv = select i1 %tobool, i64 123, i64 0
ret i64 %rv
}
-; CHECK-LABEL: setcc_resultty
-; CHECK-DAG: mov %g0, %o0
-; CHECK-DAG: mov %i0, %o1
-; CHECK-DAG: mov %g0, %o2
-; CHECK-DAG: mov 32, %o3
-; CHECK-DAG: call __multi3
-; CHECK: movrnz %o0, 1, [[R:%[gilo][0-7]]]
-; CHECK: or [[R]], %i1, %i0
-
-define i1 @setcc_resultty(i64 %a, i1 %b) {
+define i1 @setcc_resultty(i64 %a, i1 %b) nounwind {
+; CHECK-LABEL: setcc_resultty:
+; CHECK: ! %bb.0:
+; CHECK-NEXT: save %sp, -128, %sp
+; CHECK-NEXT: mov %g0, %i2
+; CHECK-NEXT: sethi 4194303, %i3
+; CHECK-NEXT: or %i3, 1023, %i3
+; CHECK-NEXT: sethi 131071, %i4
+; CHECK-NEXT: or %i4, 1023, %i4
+; CHECK-NEXT: sllx %i4, 32, %i4
+; CHECK-NEXT: or %i4, %i3, %i3
+; CHECK-NEXT: and %i0, %i3, %i3
+; CHECK-NEXT: cmp %i3, %i0
+; CHECK-NEXT: movne %xcc, 1, %i2
+; CHECK-NEXT: or %i2, %i1, %i0
+; CHECK-NEXT: ret
+; CHECK-NEXT: restore
%a0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a, i64 32)
%a1 = extractvalue { i64, i1 } %a0, 1
%a4 = or i1 %a1, %b
diff --git a/llvm/test/CodeGen/SPARC/fp128-split.ll b/llvm/test/CodeGen/SPARC/fp128-split.ll
index c87cfb9..8a127c9 100644
--- a/llvm/test/CodeGen/SPARC/fp128-split.ll
+++ b/llvm/test/CodeGen/SPARC/fp128-split.ll
@@ -8,45 +8,33 @@
define fp128 @testcase(fp128 %0) {
; CHECK-LABEL: name: testcase
; CHECK: bb.0.Entry:
- ; CHECK: liveins: $q0
- ; CHECK: [[COPY:%[0-9]+]]:qfpregs = COPY $q0
- ; CHECK: [[COPY1:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_odd64
- ; CHECK: [[ADDri:%[0-9]+]]:i64regs = ADDri %stack.0, 0
- ; CHECK: [[ORri:%[0-9]+]]:i64regs = ORri killed [[ADDri]], 8
- ; CHECK: STDFrr [[ORri]], $g0, killed [[COPY1]] :: (store (s64) into %stack.0 + 8)
- ; CHECK: [[COPY2:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_even64
- ; CHECK: STDFri %stack.0, 0, killed [[COPY2]] :: (store (s64) into %stack.0, align 16)
- ; CHECK: [[LDXrr:%[0-9]+]]:i64regs = LDXrr [[ORri]], $g0 :: (load (s64) from %stack.0 + 8)
- ; CHECK: [[LDXri:%[0-9]+]]:i64regs = LDXri %stack.0, 0 :: (load (s64) from %stack.0, align 16)
- ; CHECK: [[COPY3:%[0-9]+]]:intregs = COPY [[LDXrr]]
- ; CHECK: [[COPY4:%[0-9]+]]:intregs = COPY [[LDXri]]
- ; CHECK: [[SRLXri:%[0-9]+]]:i64regs = SRLXri [[LDXrr]], 32
- ; CHECK: [[COPY5:%[0-9]+]]:intregs = COPY [[SRLXri]]
- ; CHECK: [[SRLXri1:%[0-9]+]]:i64regs = SRLXri [[LDXri]], 32
- ; CHECK: [[COPY6:%[0-9]+]]:intregs = COPY [[SRLXri1]]
- ; CHECK: [[ADDCCri:%[0-9]+]]:intregs = ADDCCri killed [[COPY3]], -1, implicit-def $icc
- ; CHECK: [[ADDEri:%[0-9]+]]:intregs = ADDEri killed [[COPY5]], -1, implicit-def $icc, implicit $icc
- ; CHECK: [[ADDEri1:%[0-9]+]]:intregs = ADDEri killed [[COPY4]], -1, implicit-def $icc, implicit $icc
- ; CHECK: [[ADDEri2:%[0-9]+]]:intregs = ADDEri killed [[COPY6]], -1, implicit-def dead $icc, implicit $icc
- ; CHECK: [[SRLri:%[0-9]+]]:i64regs = SRLri killed [[ADDCCri]], 0
- ; CHECK: [[COPY7:%[0-9]+]]:i64regs = COPY [[ADDEri]]
- ; CHECK: [[SLLXri:%[0-9]+]]:i64regs = SLLXri killed [[COPY7]], 32
- ; CHECK: [[ORrr:%[0-9]+]]:i64regs = ORrr killed [[SLLXri]], killed [[SRLri]]
- ; CHECK: [[ADDri1:%[0-9]+]]:i64regs = ADDri %stack.1, 0
- ; CHECK: [[ORri1:%[0-9]+]]:i64regs = ORri killed [[ADDri1]], 8
- ; CHECK: STXrr [[ORri1]], $g0, killed [[ORrr]] :: (store (s64) into %stack.1 + 8, basealign 16)
- ; CHECK: [[SRLri1:%[0-9]+]]:i64regs = SRLri killed [[ADDEri1]], 0
- ; CHECK: [[COPY8:%[0-9]+]]:i64regs = COPY [[ADDEri2]]
- ; CHECK: [[SLLXri1:%[0-9]+]]:i64regs = SLLXri killed [[COPY8]], 32
- ; CHECK: [[ORrr1:%[0-9]+]]:i64regs = ORrr killed [[SLLXri1]], killed [[SRLri1]]
- ; CHECK: STXri %stack.1, 0, killed [[ORrr1]] :: (store (s64) into %stack.1, align 16)
- ; CHECK: [[LDDFri:%[0-9]+]]:dfpregs = LDDFri %stack.1, 0 :: (load (s64) from %stack.1, align 16)
- ; CHECK: [[DEF:%[0-9]+]]:qfpregs = IMPLICIT_DEF
- ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:qfpregs = INSERT_SUBREG [[DEF]], killed [[LDDFri]], %subreg.sub_even64
- ; CHECK: [[LDDFrr:%[0-9]+]]:dfpregs = LDDFrr [[ORri1]], $g0 :: (load (s64) from %stack.1 + 8)
- ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:qfpregs = INSERT_SUBREG [[INSERT_SUBREG]], killed [[LDDFrr]], %subreg.sub_odd64
- ; CHECK: $q0 = COPY [[INSERT_SUBREG1]]
- ; CHECK: RETL 8, implicit $q0
+ ; CHECK-NEXT: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:qfpregs = COPY $q0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_odd64
+ ; CHECK-NEXT: [[ADDri:%[0-9]+]]:i64regs = ADDri %stack.0, 0
+ ; CHECK-NEXT: [[ORri:%[0-9]+]]:i64regs = ORri killed [[ADDri]], 8
+ ; CHECK-NEXT: STDFrr [[ORri]], $g0, killed [[COPY1]] :: (store (s64) into %stack.0 + 8)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:dfpregs = COPY [[COPY]].sub_even64
+ ; CHECK-NEXT: STDFri %stack.0, 0, killed [[COPY2]] :: (store (s64) into %stack.0, align 16)
+ ; CHECK-NEXT: [[LDXrr:%[0-9]+]]:i64regs = LDXrr [[ORri]], $g0 :: (load (s64) from %stack.0 + 8)
+ ; CHECK-NEXT: [[LDXri:%[0-9]+]]:i64regs = LDXri %stack.0, 0 :: (load (s64) from %stack.0, align 16)
+ ; CHECK-NEXT: [[ADDri1:%[0-9]+]]:i64regs = ADDri %stack.1, 0
+ ; CHECK-NEXT: [[ORri1:%[0-9]+]]:i64regs = ORri killed [[ADDri1]], 8
+ ; CHECK-NEXT: [[ADDri2:%[0-9]+]]:i64regs = ADDri [[LDXrr]], -1
+ ; CHECK-NEXT: STXrr [[ORri1]], $g0, killed [[ADDri2]] :: (store (s64) into %stack.1 + 8, basealign 16)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:intregs = COPY $g0
+ ; CHECK-NEXT: [[MOVRri:%[0-9]+]]:intregs = MOVRri [[LDXrr]], 1, [[COPY3]], 49
+ ; CHECK-NEXT: [[SRLri:%[0-9]+]]:i64regs = SRLri killed [[MOVRri]], 0
+ ; CHECK-NEXT: [[SUBrr:%[0-9]+]]:i64regs = SUBrr killed [[LDXri]], killed [[SRLri]]
+ ; CHECK-NEXT: STXri %stack.1, 0, killed [[SUBrr]] :: (store (s64) into %stack.1, align 16)
+ ; CHECK-NEXT: [[LDDFri:%[0-9]+]]:dfpregs = LDDFri %stack.1, 0 :: (load (s64) from %stack.1, align 16)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:qfpregs = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:qfpregs = INSERT_SUBREG [[DEF]], killed [[LDDFri]], %subreg.sub_even64
+ ; CHECK-NEXT: [[LDDFrr:%[0-9]+]]:dfpregs = LDDFrr [[ORri1]], $g0 :: (load (s64) from %stack.1 + 8)
+ ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:qfpregs = INSERT_SUBREG [[INSERT_SUBREG]], killed [[LDDFrr]], %subreg.sub_odd64
+ ; CHECK-NEXT: $q0 = COPY [[INSERT_SUBREG1]]
+ ; CHECK-NEXT: RETL 8, implicit $q0
Entry:
%1 = bitcast fp128 %0 to i128
%2 = add i128 %1, -1
diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
index ae1de44..ac0b112 100644
--- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
@@ -2,14 +2,10 @@
; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC
; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64
-define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
+define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC-LABEL: muloti_test:
-; SPARC: .cfi_startproc
-; SPARC-NEXT: ! %bb.0: ! %start
+; SPARC: ! %bb.0: ! %start
; SPARC-NEXT: save %sp, -96, %sp
-; SPARC-NEXT: .cfi_def_cfa_register %fp
-; SPARC-NEXT: .cfi_window_save
-; SPARC-NEXT: .cfi_register %o7, %i7
; SPARC-NEXT: ld [%fp+96], %l1
; SPARC-NEXT: mov %i3, %g4
; SPARC-NEXT: mov %i2, %g2
@@ -172,105 +168,97 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; SPARC-NEXT: restore %g0, %g3, %o1
;
; SPARC64-LABEL: muloti_test:
-; SPARC64: .cfi_startproc
-; SPARC64-NEXT: .register %g2, #scratch
+; SPARC64: .register %g2, #scratch
; SPARC64-NEXT: .register %g3, #scratch
; SPARC64-NEXT: ! %bb.0: ! %start
; SPARC64-NEXT: save %sp, -176, %sp
-; SPARC64-NEXT: .cfi_def_cfa_register %fp
-; SPARC64-NEXT: .cfi_window_save
-; SPARC64-NEXT: .cfi_register %o7, %i7
-; SPARC64-NEXT: mov %i3, %i4
-; SPARC64-NEXT: mov %i1, %i3
-; SPARC64-NEXT: srax %i0, 63, %o2
-; SPARC64-NEXT: mov %i2, %o0
-; SPARC64-NEXT: mov %i4, %o1
-; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %o2, %o3
-; SPARC64-NEXT: mov %o0, %i1
-; SPARC64-NEXT: mov %o1, %i5
-; SPARC64-NEXT: srax %i2, 63, %o0
-; SPARC64-NEXT: mov %o0, %o1
-; SPARC64-NEXT: mov %i0, %o2
-; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i3, %o3
-; SPARC64-NEXT: srlx %i5, 32, %g2
-; SPARC64-NEXT: srlx %o1, 32, %g3
-; SPARC64-NEXT: srlx %i1, 32, %g4
-; SPARC64-NEXT: srlx %o0, 32, %g5
-; SPARC64-NEXT: addcc %o1, %i5, %l0
-; SPARC64-NEXT: addxcc %g3, %g2, %l1
-; SPARC64-NEXT: addxcc %o0, %i1, %l2
-; SPARC64-NEXT: addxcc %g5, %g4, %l3
+; SPARC64-NEXT: mov %i3, %i5
+; SPARC64-NEXT: mov %i2, %i3
+; SPARC64-NEXT: mov %i1, %i2
+; SPARC64-NEXT: mov %i0, %i4
; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i3, %o1
+; SPARC64-NEXT: mov %i1, %o1
; SPARC64-NEXT: mov %g0, %o2
; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i4, %o3
-; SPARC64-NEXT: mov %o0, %i5
+; SPARC64-NEXT: mov %i5, %o3
+; SPARC64-NEXT: mov %o0, %i0
; SPARC64-NEXT: mov %o1, %i1
; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i0, %o1
+; SPARC64-NEXT: mov %i4, %o1
; SPARC64-NEXT: mov %g0, %o2
; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i4, %o3
-; SPARC64-NEXT: srlx %i5, 32, %i4
-; SPARC64-NEXT: srlx %o1, 32, %g2
-; SPARC64-NEXT: srlx %o0, 32, %g3
-; SPARC64-NEXT: addcc %o1, %i5, %i5
-; SPARC64-NEXT: addxcc %g2, %i4, %i4
-; SPARC64-NEXT: addxcc %o0, 0, %l4
-; SPARC64-NEXT: addxcc %g3, 0, %l5
+; SPARC64-NEXT: mov %i5, %o3
+; SPARC64-NEXT: mov %g0, %g2
+; SPARC64-NEXT: add %o1, %i0, %i0
+; SPARC64-NEXT: cmp %i0, %o1
+; SPARC64-NEXT: movcs %xcc, 1, %g2
+; SPARC64-NEXT: srl %g2, 0, %g2
+; SPARC64-NEXT: add %o0, %g2, %l0
; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i3, %o1
+; SPARC64-NEXT: mov %i2, %o1
; SPARC64-NEXT: mov %g0, %o2
; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i2, %o3
-; SPARC64-NEXT: srlx %o1, 32, %g2
-; SPARC64-NEXT: srlx %o0, 32, %g3
-; SPARC64-NEXT: addcc %o1, %i5, %i3
-; SPARC64-NEXT: addxcc %g2, %i4, %i4
-; SPARC64-NEXT: addxcc %o0, 0, %i5
-; SPARC64-NEXT: addxcc %g3, 0, %g2
-; SPARC64-NEXT: addcc %l4, %i5, %i5
-; SPARC64-NEXT: addxcc %l5, %g2, %l4
-; SPARC64-NEXT: addxcc %g0, 0, %l5
-; SPARC64-NEXT: addxcc %g0, 0, %l6
+; SPARC64-NEXT: mov %i3, %o3
+; SPARC64-NEXT: mov %g0, %g2
+; SPARC64-NEXT: mov %g0, %g3
+; SPARC64-NEXT: add %o1, %i0, %i0
+; SPARC64-NEXT: cmp %i0, %o1
+; SPARC64-NEXT: movcs %xcc, 1, %g2
+; SPARC64-NEXT: srl %g2, 0, %g2
+; SPARC64-NEXT: add %o0, %g2, %g2
+; SPARC64-NEXT: add %l0, %g2, %l1
+; SPARC64-NEXT: cmp %l1, %l0
+; SPARC64-NEXT: movcs %xcc, 1, %g3
+; SPARC64-NEXT: srl %g3, 0, %l0
; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i0, %o1
+; SPARC64-NEXT: mov %i4, %o1
; SPARC64-NEXT: mov %g0, %o2
; SPARC64-NEXT: call __multi3
+; SPARC64-NEXT: mov %i3, %o3
+; SPARC64-NEXT: mov %g0, %g2
+; SPARC64-NEXT: add %o0, %l0, %g3
+; SPARC64-NEXT: add %o1, %l1, %l1
+; SPARC64-NEXT: cmp %l1, %o1
+; SPARC64-NEXT: movcs %xcc, 1, %g2
+; SPARC64-NEXT: srl %g2, 0, %g2
+; SPARC64-NEXT: add %g3, %g2, %l2
+; SPARC64-NEXT: srax %i4, 63, %o2
+; SPARC64-NEXT: mov %i3, %o0
+; SPARC64-NEXT: mov %i5, %o1
+; SPARC64-NEXT: call __multi3
+; SPARC64-NEXT: mov %o2, %o3
+; SPARC64-NEXT: mov %o0, %i5
+; SPARC64-NEXT: mov %o1, %l0
+; SPARC64-NEXT: srax %i3, 63, %o0
+; SPARC64-NEXT: mov %o0, %o1
+; SPARC64-NEXT: mov %i4, %o2
+; SPARC64-NEXT: call __multi3
; SPARC64-NEXT: mov %i2, %o3
; SPARC64-NEXT: mov %g0, %i2
-; SPARC64-NEXT: srlx %o1, 32, %i0
-; SPARC64-NEXT: addcc %o1, %i5, %i5
-; SPARC64-NEXT: srlx %o0, 32, %g2
-; SPARC64-NEXT: addxcc %i0, %l4, %i0
-; SPARC64-NEXT: addxcc %o0, %l5, %g3
-; SPARC64-NEXT: addxcc %g2, %l6, %g2
-; SPARC64-NEXT: addcc %i5, %l0, %i5
-; SPARC64-NEXT: addxcc %i0, %l1, %i0
-; SPARC64-NEXT: addxcc %g3, %l2, %g3
-; SPARC64-NEXT: addxcc %g2, %l3, %g2
-; SPARC64-NEXT: srl %g3, 0, %g3
-; SPARC64-NEXT: sllx %g2, 32, %g2
-; SPARC64-NEXT: or %g2, %g3, %g2
-; SPARC64-NEXT: sllx %i4, 32, %i4
-; SPARC64-NEXT: srax %i4, 63, %g3
-; SPARC64-NEXT: xor %g2, %g3, %g2
-; SPARC64-NEXT: srl %i5, 0, %i5
-; SPARC64-NEXT: sllx %i0, 32, %i0
-; SPARC64-NEXT: or %i0, %i5, %i0
-; SPARC64-NEXT: xor %i0, %g3, %i0
-; SPARC64-NEXT: or %i0, %g2, %i0
-; SPARC64-NEXT: movrnz %i0, 1, %i2
-; SPARC64-NEXT: srl %i3, 0, %i0
-; SPARC64-NEXT: or %i4, %i0, %i0
+; SPARC64-NEXT: mov %g0, %i3
+; SPARC64-NEXT: mov %g0, %i4
+; SPARC64-NEXT: add %o0, %i5, %i5
+; SPARC64-NEXT: add %o1, %l0, %g2
+; SPARC64-NEXT: cmp %g2, %o1
+; SPARC64-NEXT: movcs %xcc, 1, %i2
; SPARC64-NEXT: srl %i2, 0, %i2
+; SPARC64-NEXT: add %i5, %i2, %i2
+; SPARC64-NEXT: add %l2, %i2, %i2
+; SPARC64-NEXT: add %l1, %g2, %i5
+; SPARC64-NEXT: cmp %i5, %l1
+; SPARC64-NEXT: movcs %xcc, 1, %i3
+; SPARC64-NEXT: srl %i3, 0, %i3
+; SPARC64-NEXT: add %i2, %i3, %i2
+; SPARC64-NEXT: srax %i0, 63, %i3
+; SPARC64-NEXT: xor %i2, %i3, %i2
+; SPARC64-NEXT: xor %i5, %i3, %i3
+; SPARC64-NEXT: or %i3, %i2, %i2
+; SPARC64-NEXT: movrnz %i2, 1, %i4
+; SPARC64-NEXT: srl %i4, 0, %i2
; SPARC64-NEXT: ret
; SPARC64-NEXT: restore
start:
- %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) #2
+ %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r)
%1 = extractvalue { i128, i1 } %0, 0
%2 = extractvalue { i128, i1 } %0, 1
%3 = zext i1 %2 to i8
@@ -279,9 +267,4 @@ start:
ret { i128, i8 } %5
}
-; Function Attrs: nounwind readnone speculatable
-declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128) #1
-
-attributes #0 = { nounwind readnone uwtable }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { nounwind }
+declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128)
diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
index 9ca895f..01383a0 100644
--- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
@@ -2,14 +2,10 @@
; RUN: llc < %s -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC
; RUN: llc < %s -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=SPARC64
-define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
+define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC-LABEL: muloti_test:
-; SPARC: .cfi_startproc
-; SPARC-NEXT: ! %bb.0: ! %start
+; SPARC: ! %bb.0: ! %start
; SPARC-NEXT: save %sp, -96, %sp
-; SPARC-NEXT: .cfi_def_cfa_register %fp
-; SPARC-NEXT: .cfi_window_save
-; SPARC-NEXT: .cfi_register %o7, %i7
; SPARC-NEXT: mov %i3, %g2
; SPARC-NEXT: mov %i2, %g4
; SPARC-NEXT: umul %i2, %i5, %i2
@@ -160,14 +156,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; SPARC-NEXT: restore %g0, %g2, %o1
;
; SPARC64-LABEL: muloti_test:
-; SPARC64: .cfi_startproc
-; SPARC64-NEXT: .register %g2, #scratch
+; SPARC64: .register %g2, #scratch
; SPARC64-NEXT: .register %g3, #scratch
; SPARC64-NEXT: ! %bb.0: ! %start
; SPARC64-NEXT: save %sp, -176, %sp
-; SPARC64-NEXT: .cfi_def_cfa_register %fp
-; SPARC64-NEXT: .cfi_window_save
-; SPARC64-NEXT: .cfi_register %o7, %i7
; SPARC64-NEXT: mov %g0, %o0
; SPARC64-NEXT: mov %i2, %o1
; SPARC64-NEXT: mov %g0, %o2
@@ -208,7 +200,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; SPARC64-NEXT: ret
; SPARC64-NEXT: restore %g0, %o1, %o1
start:
- %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
+ %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r)
%1 = extractvalue { i128, i1 } %0, 0
%2 = extractvalue { i128, i1 } %0, 1
%3 = zext i1 %2 to i8
@@ -217,9 +209,4 @@ start:
ret { i128, i8 } %5
}
-; Function Attrs: nounwind readnone speculatable
-declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128) #1
-
-attributes #0 = { nounwind readnone uwtable }
-attributes #1 = { nounwind readnone speculatable }
-attributes #2 = { nounwind }
+declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128)
diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll
index 69f7334..ba40c5c 100644
--- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll
+++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll
@@ -25,10 +25,8 @@ define void @test(<1 x i64> %c64, <1 x i64> %mask1, ptr %P) {
; CHECK-NEXT: popl %edi
; CHECK-NEXT: retl
entry:
- %tmp4 = bitcast <1 x i64> %mask1 to x86_mmx ; <x86_mmx> [#uses=1]
- %tmp6 = bitcast <1 x i64> %c64 to x86_mmx ; <x86_mmx> [#uses=1]
- tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp4, x86_mmx %tmp6, ptr %P )
+ tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %mask1, <1 x i64> %c64, ptr %P )
ret void
}
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr)
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr)
diff --git a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
index 79b06ba..6c58678 100644
--- a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
+++ b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | FileCheck %s
-@R = external global x86_mmx ; <ptr> [#uses=1]
+@R = external global <1 x i64> ; <ptr> [#uses=1]
define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
; CHECK-LABEL: foo:
@@ -14,13 +14,11 @@ define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
; CHECK-NEXT: emms
; CHECK-NEXT: retq
entry:
- %tmp4 = bitcast <1 x i64> %B to x86_mmx ; <<4 x i16>> [#uses=1]
- %tmp6 = bitcast <1 x i64> %A to x86_mmx ; <<4 x i16>> [#uses=1]
- %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 ) ; <x86_mmx> [#uses=1]
- store x86_mmx %tmp7, ptr @R
+ %tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w( <1 x i64> %A, <1 x i64> %B ) ; <<1 x i64>> [#uses=1]
+ store <1 x i64> %tmp7, ptr @R
tail call void @llvm.x86.mmx.emms( )
ret void
}
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
index d439e82..0c79264 100644
--- a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
+++ b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
@@ -5,15 +5,15 @@ entry:
tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <x86_mmx> [#uses=1]
+ %tmp1 = tail call <1 x i64> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <<1 x i64>> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind ; <i32> [#uses=1]
+ %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef ) nounwind ; <i32> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind
+ tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef, i32 undef, i32 %tmp3 ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind ; <i32> [#uses=0]
+ %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> %tmp1 ) nounwind ; <i32> [#uses=0]
ret i32 undef
}
diff --git a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
index 594edba..4a44778 100644
--- a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
+++ b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
@@ -17,13 +17,13 @@ entry:
br i1 false, label %bb.nph144.split, label %bb133
bb.nph144.split: ; preds = %entry
- %tmp = bitcast <8 x i8> zeroinitializer to x86_mmx
- %tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx
- tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, ptr null ) nounwind
+ %tmp = bitcast <8 x i8> zeroinitializer to <1 x i64>
+ %tmp2 = bitcast <8 x i8> zeroinitializer to <1 x i64>
+ tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp, <1 x i64> %tmp2, ptr null ) nounwind
unreachable
bb133: ; preds = %entry
ret void
}
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind
diff --git a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
index 3a112ae..20673a1 100644
--- a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
+++ b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -26,7 +26,7 @@ entry:
; This is how to get MMX instructions.
-define <2 x double> @a2(x86_mmx %x) nounwind {
+define <2 x double> @a2(<1 x i64> %x) nounwind {
; CHECK-LABEL: a2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
@@ -42,11 +42,11 @@ define <2 x double> @a2(x86_mmx %x) nounwind {
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
- %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x)
+ %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %x)
ret <2 x double> %y
}
-define x86_mmx @b2(<2 x double> %x) nounwind {
+define <1 x i64> @b2(<2 x double> %x) nounwind {
; CHECK-LABEL: b2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
@@ -61,9 +61,9 @@ define x86_mmx @b2(<2 x double> %x) nounwind {
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
- %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
- ret x86_mmx %y
+ %y = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
+ ret <1 x i64> %y
}
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>)
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>)
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>)
diff --git a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
index 306aeed..582ebb9 100644
--- a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
+++ b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
@@ -3,14 +3,14 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-macosx10.6.6"
-%0 = type { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx }
+%0 = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
define i32 @pixman_fill_mmx(ptr nocapture %bits, i32 %stride, i32 %bpp, i32 %x, i32 %y, i32 %width, i32 %height, i32 %xor) nounwind ssp {
entry:
%conv = zext i32 %xor to i64
%shl = shl nuw i64 %conv, 32
%or = or i64 %shl, %conv
- %0 = bitcast i64 %or to x86_mmx
+ %0 = bitcast i64 %or to <1 x i64>
; CHECK: movq [[MMXR:%mm[0-7],]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
@@ -18,7 +18,7 @@ entry:
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
- %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(x86_mmx %0) nounwind, !srcloc !0
+ %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %0) nounwind, !srcloc !0
%asmresult = extractvalue %0 %1, 0
%asmresult6 = extractvalue %0 %1, 1
%asmresult7 = extractvalue %0 %1, 2
@@ -34,7 +34,7 @@ entry:
; CHECK-NEXT: movq {{%mm[0-7]}},
; CHECK-NEXT: movq {{%mm[0-7]}},
; CHECK-NEXT: movq {{%mm[0-7]}},
- tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, x86_mmx %0, x86_mmx %asmresult, x86_mmx %asmresult6, x86_mmx %asmresult7, x86_mmx %asmresult8, x86_mmx %asmresult9, x86_mmx %asmresult10, x86_mmx %asmresult11) nounwind, !srcloc !1
+ tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, <1 x i64> %0, <1 x i64> %asmresult, <1 x i64> %asmresult6, <1 x i64> %asmresult7, <1 x i64> %asmresult8, <1 x i64> %asmresult9, <1 x i64> %asmresult10, <1 x i64> %asmresult11) nounwind, !srcloc !1
tail call void @llvm.x86.mmx.emms() nounwind
ret i32 1
}
diff --git a/llvm/test/CodeGen/X86/apx/i386-ndd.ll b/llvm/test/CodeGen/X86/apx/i386-ndd.ll
new file mode 100644
index 0000000..146a993
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/i386-ndd.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+cmov,+ndd < %s | FileCheck %s
+define i32 @test(i1 %cmp, i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: cmovnel %eax, %ecx
+; CHECK-NEXT: movl (%ecx), %eax
+; CHECK-NEXT: retl
+entry:
+ %cmov = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cmov
+}
diff --git a/llvm/test/CodeGen/X86/apx/setzucc.ll b/llvm/test/CodeGen/X86/apx/setzucc.ll
index 1436d39..084e542 100644
--- a/llvm/test/CodeGen/X86/apx/setzucc.ll
+++ b/llvm/test/CodeGen/X86/apx/setzucc.ll
@@ -46,3 +46,46 @@ define i64 @i64(i64 %x) nounwind {
%if = select i1 %t0, i64 1, i64 0
ret i64 %if
}
+
+define i32 @flags_copy_lowering() nounwind {
+; CHECK-LABEL: flags_copy_lowering:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB4_1: # %bb1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addl %edx, 0
+; CHECK-NEXT: setb %sil
+; CHECK-NEXT: adcl $0, %ecx
+; CHECK-NEXT: testb %sil, %sil
+; CHECK-NEXT: setzune %dl
+; CHECK-NEXT: testb %sil, %sil
+; CHECK-NEXT: je .LBB4_3
+; CHECK-NEXT: # %bb.2: # %bb1
+; CHECK-NEXT: # in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB4_1
+; CHECK-NEXT: .LBB4_3: # %bb2
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+bb:
+ br label %bb1
+
+bb1:
+ %phi = phi i32 [ 0, %bb ], [ %zext, %bb1 ]
+ %phi2 = phi i32 [ 0, %bb ], [ %add3, %bb1 ]
+ %load = load i32, ptr null, align 4
+ %add = add i32 %load, %phi
+ store i32 %add, ptr null, align 4
+ %icmp = icmp ugt i32 %phi, %add
+ %zext = zext i1 %icmp to i32
+ %add3 = add i32 %phi2, %zext
+ %icmp4 = icmp ult i32 %phi2, 0
+ %and = and i1 %icmp, false
+ br i1 %and, label %bb1, label %bb2
+
+bb2:
+ ret i32 0
+}
diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll
new file mode 100644
index 0000000..da16a7d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+define i128 @avgflooru_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: adcq %rcx, %rsi
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: shrdq $1, %rsi, %rax
+; CHECK-NEXT: movzbl %cl, %edx
+; CHECK-NEXT: shldq $63, %rsi, %rdx
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %y, %x
+ %lshr = lshr i128 %xor, 1
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+declare void @use(i8)
+
+define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: avgflooru_i128_multi_use:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movq %rcx, %rbx
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rsi, %r15
+; CHECK-NEXT: movq %rdi, %r12
+; CHECK-NEXT: movq %rdx, %r13
+; CHECK-NEXT: xorq %rdi, %r13
+; CHECK-NEXT: movq %rcx, %rbp
+; CHECK-NEXT: xorq %rsi, %rbp
+; CHECK-NEXT: movq %r13, %rdi
+; CHECK-NEXT: movq %rbp, %rsi
+; CHECK-NEXT: callq use@PLT
+; CHECK-NEXT: shrdq $1, %rbp, %r13
+; CHECK-NEXT: shrq %rbp
+; CHECK-NEXT: movq %r13, %rdi
+; CHECK-NEXT: movq %rbp, %rsi
+; CHECK-NEXT: callq use@PLT
+; CHECK-NEXT: addq %r14, %r12
+; CHECK-NEXT: adcq %rbx, %r15
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: shrdq $1, %r15, %r12
+; CHECK-NEXT: movzbl %al, %edx
+; CHECK-NEXT: shldq $63, %r15, %rdx
+; CHECK-NEXT: movq %r12, %rax
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %y, %x
+ call void @use(i128 %xor)
+ %lshr = lshr i128 %xor, 1
+ call void @use(i128 %lshr)
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+; This test case shouldn't combine because it's not
+; an avgflooru operation
+
+define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_negative:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: notq %rsi
+; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: adcq %rcx, %rsi
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %x, -1
+ %and = and i128 %y, %x
+ %add = add i128 %xor, %and
+ ret i128 %add
+}
+
+; This negative test case shouldn't combine, i32 is already properly
+; handled in terms of legalization, compared to the i128
+
+define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: retq
+start:
+ %xor = xor i32 %y, %x
+ %lshr = lshr i32 %xor, 1
+ %and = and i32 %y, %x
+ %add = add i32 %lshr, %and
+ ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: setb %dil
+; CHECK-NEXT: movzbl %dil, %edi
+; CHECK-NEXT: shldq $63, %rdx, %rdi
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT: setb %r9b
+; CHECK-NEXT: movzbl %r9b, %r9d
+; CHECK-NEXT: shldq $63, %r8, %r9
+; CHECK-NEXT: shldq $63, %rsi, %rdx
+; CHECK-NEXT: shldq $63, %rcx, %r8
+; CHECK-NEXT: movq %r8, 16(%rax)
+; CHECK-NEXT: movq %rdx, (%rax)
+; CHECK-NEXT: movq %r9, 24(%rax)
+; CHECK-NEXT: movq %rdi, 8(%rax)
+; CHECK-NEXT: retq
+start:
+ %xor = xor <2 x i128> %y, %x
+ %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+ %and = and <2 x i128> %y, %x
+ %add = add <2 x i128> %lshr, %and
+ ret <2 x i128> %add
+}
diff --git a/llvm/test/CodeGen/X86/avgflooru-scalar.ll b/llvm/test/CodeGen/X86/avgflooru-scalar.ll
index d21c9d6..0c91a9d 100644
--- a/llvm/test/CodeGen/X86/avgflooru-scalar.ll
+++ b/llvm/test/CodeGen/X86/avgflooru-scalar.ll
@@ -168,26 +168,14 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
; X86-LABEL: test_fixed_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: shrdl $1, %edx, %ebx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setb %dl
+; X86-NEXT: movzbl %dl, %edx
+; X86-NEXT: shldl $31, %eax, %edx
+; X86-NEXT: shldl $31, %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_fixed_i64:
@@ -208,26 +196,14 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
; X86-LABEL: test_ext_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: shrdl $1, %edx, %ebx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: shrl %edx
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setb %dl
+; X86-NEXT: movzbl %dl, %edx
+; X86-NEXT: shldl $31, %eax, %edx
+; X86-NEXT: shldl $31, %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_ext_i64:
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index 3f6f8c0..c69886d 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -1011,7 +1011,7 @@ define float @broadcast_lifetime() nounwind {
ret float %7
}
-define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
+define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86: ## %bb.0: ## %bb
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
@@ -1023,7 +1023,7 @@ define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
bb:
- %tmp1 = bitcast x86_mmx %tmp to i64
+ %tmp1 = bitcast <1 x i64> %tmp to i64
%tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
%tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index fed6c2e..9ac05038 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1449,7 +1449,7 @@ eintry:
ret void
}
-define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
+define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86: ## %bb.0: ## %bb
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
@@ -1466,7 +1466,7 @@ define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
; X64-AVX512VL-NEXT: vpbroadcastq %rdi, %xmm0
; X64-AVX512VL-NEXT: retq
bb:
- %tmp1 = bitcast x86_mmx %tmp to i64
+ %tmp1 = bitcast <1 x i64> %tmp to i64
%tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
%tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/bitcast-mmx.ll b/llvm/test/CodeGen/X86/bitcast-mmx.ll
index 061723a..fe48a96 100644
--- a/llvm/test/CodeGen/X86/bitcast-mmx.ll
+++ b/llvm/test/CodeGen/X86/bitcast-mmx.ll
@@ -17,9 +17,9 @@ define i32 @t0(i64 %x) nounwind {
; X64-NEXT: retq
entry:
%0 = bitcast i64 %x to <4 x i16>
- %1 = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 -18)
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 -18)
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
%6 = bitcast i64 %5 to <2 x i32>
@@ -52,9 +52,9 @@ define i64 @t1(i64 %x, i32 %n) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = bitcast i64 %x to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %n)
- %2 = bitcast x86_mmx %1 to i64
+ %0 = bitcast i64 %x to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %n)
+ %2 = bitcast <1 x i64> %1 to i64
ret i64 %2
}
@@ -88,11 +88,11 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
entry:
%0 = insertelement <2 x i32> undef, i32 %w, i32 0
%1 = insertelement <2 x i32> %0, i32 0, i32 1
- %2 = bitcast <2 x i32> %1 to x86_mmx
- %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %2, i32 %n)
- %4 = bitcast i64 %x to x86_mmx
- %5 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %4, x86_mmx %3)
- %6 = bitcast x86_mmx %5 to i64
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %2, i32 %n)
+ %4 = bitcast i64 %x to <1 x i64>
+ %5 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %4, <1 x i64> %3)
+ %6 = bitcast <1 x i64> %5 to i64
ret i64 %6
}
@@ -123,14 +123,14 @@ define i64 @t3(ptr %y, ptr %n) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %y, align 8
+ %0 = load <1 x i64>, ptr %y, align 8
%1 = load i32, ptr %n, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/exp10-libcall-names.ll b/llvm/test/CodeGen/X86/exp10-libcall-names.ll
index 52f0f4a..4f993cf 100644
--- a/llvm/test/CodeGen/X86/exp10-libcall-names.ll
+++ b/llvm/test/CodeGen/X86/exp10-libcall-names.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=x86_64-apple-ios8.0 < %s | FileCheck -check-prefix=APPLE %s
; RUN: llc -mtriple=x86_64-apple-tvos8.0 < %s | FileCheck -check-prefix=APPLE %s
; RUN: llc -mtriple=x86_64-apple-xros8.0 < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=x86_64-apple-driverkit < %s | FileCheck -check-prefix=APPLE %s
+; RUN: llc -mtriple=x86_64-apple-driverkit24.0 < %s | FileCheck -check-prefix=APPLE %s
; RUN: not llc -mtriple=x86_64-apple-macos10.8 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
; Check exp10/exp10f is emitted as __exp10/__exp10f on assorted systems.
@@ -22,6 +24,11 @@ define float @test_exp10_f32(float %x) {
; APPLE-LABEL: test_exp10_f32:
; APPLE: ## %bb.0:
; APPLE-NEXT: jmp ___exp10f ## TAILCALL
+;
+; MISSED-LABEL: test_exp10_f32:
+; MISSED: ## %bb.0:
+; MISSED-NEXT: jmp _exp10f ## TAILCALL
+
%ret = call float @llvm.exp10.f32(float %x)
ret float %ret
}
@@ -34,6 +41,11 @@ define double @test_exp10_f64(double %x) {
; APPLE-LABEL: test_exp10_f64:
; APPLE: ## %bb.0:
; APPLE-NEXT: jmp ___exp10 ## TAILCALL
+;
+; MISSED-LABEL: test_exp10_f64:
+; MISSED: ## %bb.0:
+; MISSED-NEXT: jmp _exp10 ## TAILCALL
+;
%ret = call double @llvm.exp10.f64(double %x)
ret double %ret
}
diff --git a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
index 559560a..aa637e7 100644
--- a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
+++ b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
@@ -6,9 +6,9 @@
define <2 x i32> @test_paddw(<2 x i32> %a) nounwind readnone {
entry:
- %0 = bitcast <2 x i32> %a to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %0, x86_mmx %0)
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %0 = bitcast <2 x i32> %a to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %0, <1 x i64> %0)
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
ret <2 x i32> %2
}
diff --git a/llvm/test/CodeGen/X86/fast-isel-bc.ll b/llvm/test/CodeGen/X86/fast-isel-bc.ll
index e3bb5e7..64bdfd6 100644
--- a/llvm/test/CodeGen/X86/fast-isel-bc.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-bc.ll
@@ -4,7 +4,7 @@
; PR4684
-declare void @func2(x86_mmx)
+declare void @func2(<1 x i64>)
; This isn't spectacular, but it's MMX code at -O0...
@@ -28,7 +28,7 @@ define void @func1() nounwind {
; X64-NEXT: callq _func2
; X64-NEXT: popq %rax
; X64-NEXT: retq
- %tmp0 = bitcast <2 x i32> <i32 0, i32 2> to x86_mmx
- call void @func2(x86_mmx %tmp0)
+ %tmp0 = bitcast <2 x i32> <i32 0, i32 2> to <1 x i64>
+ call void @func2(<1 x i64> %tmp0)
ret void
}
diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
index c13fdae..3b1a8f5 100644
--- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -104,12 +104,12 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
; ALL-NEXT: movntq %mm0, (%rsi)
; ALL-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a0
- %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3)
- store x86_mmx %1, ptr %a1, align 8, !nontemporal !1
+ %0 = load <1 x i64>, ptr %a0
+ %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3)
+ store <1 x i64> %1, ptr %a1, align 8, !nontemporal !1
ret void
}
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
;
; 128-bit Vector Stores
diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
index cfec52c..5d4e86a 100644
--- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
@@ -177,6 +177,107 @@ define float @tan(float %x) #0 {
ret float %result
}
+define float @acos(float %x) #0 {
+; CHECK-LABEL: acos:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: calll _acos
+; CHECK-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
+ %result = call float @llvm.experimental.constrained.acos.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define float @asin(float %x) #0 {
+; CHECK-LABEL: asin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: calll _asin
+; CHECK-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
+ %result = call float @llvm.experimental.constrained.asin.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define float @atan(float %x) #0 {
+; CHECK-LABEL: atan:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: calll _atan
+; CHECK-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
+ %result = call float @llvm.experimental.constrained.atan.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define float @cosh(float %x) #0 {
+; CHECK-LABEL: cosh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: calll _cosh
+; CHECK-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
+ %result = call float @llvm.experimental.constrained.cosh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define float @sinh(float %x) #0 {
+; CHECK-LABEL: sinh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: calll _sinh
+; CHECK-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
+ %result = call float @llvm.experimental.constrained.sinh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
+define float @tanh(float %x) #0 {
+; CHECK-LABEL: tanh:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: calll _tanh
+; CHECK-NEXT: fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: wait
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
+ %result = call float @llvm.experimental.constrained.tanh.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret float %result
+}
+
attributes #0 = { strictfp }
declare float @llvm.experimental.constrained.ceil.f32(float, metadata)
@@ -189,3 +290,9 @@ declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata
declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.acos.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/fsafdo_test1.ll b/llvm/test/CodeGen/X86/fsafdo_test1.ll
index 61c0f59..e80a7f2 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test1.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test1.ll
@@ -4,9 +4,9 @@
; Check that fs-afdo discriminators are generated.
; V01: .loc 1 7 3 is_stmt 0 discriminator 2 # foo.c:7:3
; V01: .loc 1 9 5 is_stmt 1 discriminator 2 # foo.c:9:5
-; V0: .loc 1 9 5 is_stmt 0 discriminator 11266 # foo.c:9:5
+; V0: .loc 1 9 5 discriminator 11266 # foo.c:9:5
; V0: .loc 1 7 3 is_stmt 1 discriminator 11266 # foo.c:7:3
-; V1: .loc 1 9 5 is_stmt 0 discriminator 514 # foo.c:9:5
+; V1: .loc 1 9 5 discriminator 514 # foo.c:9:5
; V1: .loc 1 7 3 is_stmt 1 discriminator 258 # foo.c:7:3
; Check that variable __llvm_fs_discriminator__ is generated.
; V01: .type __llvm_fs_discriminator__,@object # @__llvm_fs_discriminator__
diff --git a/llvm/test/CodeGen/X86/fsafdo_test4.ll b/llvm/test/CodeGen/X86/fsafdo_test4.ll
index 6a22ea9..effc72b 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test4.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test4.ll
@@ -1,11 +1,16 @@
-; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=false < %s | FileCheck %s
-; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=true < %s | FileCheck %s
+; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=false < %s | FileCheck --implicit-check-not=.loc %s
+; RUN: llc -enable-fs-discriminator -improved-fs-discriminator=true < %s | FileCheck --implicit-check-not=.loc %s
;
; Check that fs-afdo discriminators are NOT generated, as debugInfoForProfiling is false (not set).
+; CHECK: .loc 1 7 15 prologue_end discriminator 2 # foo.c:7:15
; CHECK: .loc 1 7 3 is_stmt 0 discriminator 2 # foo.c:7:3
+; CHECK: .loc 1 0 3 # foo.c:0:3
; CHECK: .loc 1 9 5 is_stmt 1 discriminator 2 # foo.c:9:5
-; CHECK-NOT: .loc 1 9 5 is_stmt 0 discriminator
-; CHECK-NOT: .loc 1 7 3 is_stmt 1 discriminator
+; CHECK: .loc 1 0 5 is_stmt 0 # :0:5
+; CHECK: .loc 1 9 5 discriminator 2 # foo.c:9:5
+; CHECK: .loc 1 0 5 # :0:5
+; CHECK: .loc 1 7 3 is_stmt 1 discriminator 2 # foo.c:7:3
+; CHECK: .loc 1 14 3 # foo.c:14:3
; Check that variable __llvm_fs_discriminator__ is NOT generated.
; CHECK-NOT: __llvm_fs_discriminator__:
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 532b2c0..cc4d4c4 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -2602,24 +2602,22 @@ define i1 @issubnormal_or_nan_f(float %x) {
define i1 @issubnormal_or_zero_or_nan_f(float %x) {
; X86-LABEL: issubnormal_or_zero_or_nan_f:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT: sete %cl
-; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-NEXT: setge %al
-; X86-NEXT: orb %cl, %al
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fabs
+; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT: fxch %st(1)
+; X86-NEXT: fucompp
+; X86-NEXT: fnstsw %ax
+; X86-NEXT: # kill: def $ah killed $ah killed $ax
+; X86-NEXT: sahf
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: issubnormal_or_zero_or_nan_f:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT: sete %cl
-; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
-; X64-NEXT: setge %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: setb %al
; X64-NEXT: retq
%class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243) ; 0xf0|0x3 = "subnormal|zero|nan"
ret i1 %class
@@ -2773,24 +2771,22 @@ define i1 @not_issubnormal_or_nan_f(float %x) {
define i1 @not_issubnormal_or_zero_or_nan_f(float %x) {
; X86-LABEL: not_issubnormal_or_zero_or_nan_f:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: testl $2139095040, %eax # imm = 0x7F800000
-; X86-NEXT: setne %cl
-; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
-; X86-NEXT: setl %al
-; X86-NEXT: andb %cl, %al
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fabs
+; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT: fxch %st(1)
+; X86-NEXT: fucompp
+; X86-NEXT: fnstsw %ax
+; X86-NEXT: # kill: def $ah killed $ah killed $ax
+; X86-NEXT: sahf
+; X86-NEXT: setae %al
; X86-NEXT: retl
;
; X64-LABEL: not_issubnormal_or_zero_or_nan_f:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm0, %eax
-; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000
-; X64-NEXT: setne %cl
-; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF
-; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
-; X64-NEXT: setl %al
-; X64-NEXT: andb %cl, %al
+; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: setae %al
; X64-NEXT: retq
%class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780) ; ~(0xf0|0x3) = ~"subnormal|zero|nan"
ret i1 %class
diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll
index 9741f6f..f7b2538 100644
--- a/llvm/test/CodeGen/X86/known-bits.ll
+++ b/llvm/test/CodeGen/X86/known-bits.ll
@@ -115,7 +115,7 @@ define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
; X64-NEXT: andq $-1024, %rdi # imm = 0xFC00
; X64-NEXT: andq $-1024, %rsi # imm = 0xFC00
; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: adcl $0, %edx
; X64-NEXT: shldq $54, %rsi, %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index 54f048e..439d7ef 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -14,12 +14,12 @@ define void @t3() nounwind {
; X86-64-NEXT: xorl %eax, %eax
; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL
%tmp3 = load <8 x i8>, ptr @g_v8qi, align 8
- %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
- %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
+ %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64>
+ %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind
ret void
}
-define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
+define void @t4(<1 x i64> %v1, <1 x i64> %v2) nounwind {
; X86-64-LABEL: t4:
; X86-64: ## %bb.0:
; X86-64-NEXT: movq %rdi, %xmm0
@@ -28,11 +28,11 @@ define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
; X86-64-NEXT: movq %xmm1, %rdi
; X86-64-NEXT: xorl %eax, %eax
; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL
- %v1a = bitcast x86_mmx %v1 to <8 x i8>
- %v2b = bitcast x86_mmx %v2 to <8 x i8>
+ %v1a = bitcast <1 x i64> %v1 to <8 x i8>
+ %v2b = bitcast <1 x i64> %v2 to <8 x i8>
%tmp3 = add <8 x i8> %v1a, %v2b
- %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
- %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
+ %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64>
+ %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind
ret void
}
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing.ll b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
index 1ae9920..d933149 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
@@ -8,9 +8,9 @@
; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7].
; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs.
-@u1 = external global x86_mmx
+@u1 = external global <1 x i64>
-define void @t1(x86_mmx %v1) nounwind {
+define void @t1(<1 x i64> %v1) nounwind {
; X86-32-LABEL: t1:
; X86-32: ## %bb.0:
; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -25,11 +25,11 @@ define void @t1(x86_mmx %v1) nounwind {
; X86-64-NEXT: movq _u1@GOTPCREL(%rip), %rax
; X86-64-NEXT: movq %rdi, (%rax)
; X86-64-NEXT: retq
- store x86_mmx %v1, ptr @u1, align 8
+ store <1 x i64> %v1, ptr @u1, align 8
ret void
}
-@u2 = external global x86_mmx
+@u2 = external global <1 x i64>
define void @t2(<1 x i64> %v1) nounwind {
; X86-32-LABEL: t2:
@@ -46,7 +46,6 @@ define void @t2(<1 x i64> %v1) nounwind {
; X86-64-NEXT: movq _u2@GOTPCREL(%rip), %rax
; X86-64-NEXT: movq %rdi, (%rax)
; X86-64-NEXT: retq
- %tmp = bitcast <1 x i64> %v1 to x86_mmx
- store x86_mmx %tmp, ptr @u2, align 8
+ store <1 x i64> %v1, ptr @u2, align 8
ret void
}
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 230e763..5bb3b17 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -88,53 +88,53 @@ define void @test0(ptr %A, ptr %B) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %tmp1 = load x86_mmx, ptr %A
- %tmp3 = load x86_mmx, ptr %B
- %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
- %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
+ %tmp1 = load <1 x i64>, ptr %A
+ %tmp3 = load <1 x i64>, ptr %B
+ %tmp1a = bitcast <1 x i64> %tmp1 to <8 x i8>
+ %tmp3a = bitcast <1 x i64> %tmp3 to <8 x i8>
%tmp4 = add <8 x i8> %tmp1a, %tmp3a
- %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
- store x86_mmx %tmp4a, ptr %A
- %tmp7 = load x86_mmx, ptr %B
- %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7)
- store x86_mmx %tmp12, ptr %A
- %tmp16 = load x86_mmx, ptr %B
- %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16)
- store x86_mmx %tmp21, ptr %A
- %tmp27 = load x86_mmx, ptr %B
- %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
- %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
+ %tmp4a = bitcast <8 x i8> %tmp4 to <1 x i64>
+ store <1 x i64> %tmp4a, ptr %A
+ %tmp7 = load <1 x i64>, ptr %B
+ %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %tmp4a, <1 x i64> %tmp7)
+ store <1 x i64> %tmp12, ptr %A
+ %tmp16 = load <1 x i64>, ptr %B
+ %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %tmp12, <1 x i64> %tmp16)
+ store <1 x i64> %tmp21, ptr %A
+ %tmp27 = load <1 x i64>, ptr %B
+ %tmp21a = bitcast <1 x i64> %tmp21 to <8 x i8>
+ %tmp27a = bitcast <1 x i64> %tmp27 to <8 x i8>
%tmp28 = sub <8 x i8> %tmp21a, %tmp27a
- %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
- store x86_mmx %tmp28a, ptr %A
- %tmp31 = load x86_mmx, ptr %B
- %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31)
- store x86_mmx %tmp36, ptr %A
- %tmp40 = load x86_mmx, ptr %B
- %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40)
- store x86_mmx %tmp45, ptr %A
- %tmp51 = load x86_mmx, ptr %B
- %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
- %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
+ %tmp28a = bitcast <8 x i8> %tmp28 to <1 x i64>
+ store <1 x i64> %tmp28a, ptr %A
+ %tmp31 = load <1 x i64>, ptr %B
+ %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %tmp28a, <1 x i64> %tmp31)
+ store <1 x i64> %tmp36, ptr %A
+ %tmp40 = load <1 x i64>, ptr %B
+ %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %tmp36, <1 x i64> %tmp40)
+ store <1 x i64> %tmp45, ptr %A
+ %tmp51 = load <1 x i64>, ptr %B
+ %tmp45a = bitcast <1 x i64> %tmp45 to <8 x i8>
+ %tmp51a = bitcast <1 x i64> %tmp51 to <8 x i8>
%tmp52 = mul <8 x i8> %tmp45a, %tmp51a
- %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
- store x86_mmx %tmp52a, ptr %A
- %tmp57 = load x86_mmx, ptr %B
- %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
+ %tmp52a = bitcast <8 x i8> %tmp52 to <1 x i64>
+ store <1 x i64> %tmp52a, ptr %A
+ %tmp57 = load <1 x i64>, ptr %B
+ %tmp57a = bitcast <1 x i64> %tmp57 to <8 x i8>
%tmp58 = and <8 x i8> %tmp52, %tmp57a
- %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
- store x86_mmx %tmp58a, ptr %A
- %tmp63 = load x86_mmx, ptr %B
- %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
+ %tmp58a = bitcast <8 x i8> %tmp58 to <1 x i64>
+ store <1 x i64> %tmp58a, ptr %A
+ %tmp63 = load <1 x i64>, ptr %B
+ %tmp63a = bitcast <1 x i64> %tmp63 to <8 x i8>
%tmp64 = or <8 x i8> %tmp58, %tmp63a
- %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
- store x86_mmx %tmp64a, ptr %A
- %tmp69 = load x86_mmx, ptr %B
- %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
- %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
+ %tmp64a = bitcast <8 x i8> %tmp64 to <1 x i64>
+ store <1 x i64> %tmp64a, ptr %A
+ %tmp69 = load <1 x i64>, ptr %B
+ %tmp69a = bitcast <1 x i64> %tmp69 to <8 x i8>
+ %tmp64b = bitcast <1 x i64> %tmp64a to <8 x i8>
%tmp70 = xor <8 x i8> %tmp64b, %tmp69a
- %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
- store x86_mmx %tmp70a, ptr %A
+ %tmp70a = bitcast <8 x i8> %tmp70 to <1 x i64>
+ store <1 x i64> %tmp70a, ptr %A
tail call void @llvm.x86.mmx.emms()
ret void
}
@@ -196,42 +196,42 @@ define void @test1(ptr %A, ptr %B) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %tmp1 = load x86_mmx, ptr %A
- %tmp3 = load x86_mmx, ptr %B
- %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
- %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
+ %tmp1 = load <1 x i64>, ptr %A
+ %tmp3 = load <1 x i64>, ptr %B
+ %tmp1a = bitcast <1 x i64> %tmp1 to <2 x i32>
+ %tmp3a = bitcast <1 x i64> %tmp3 to <2 x i32>
%tmp4 = add <2 x i32> %tmp1a, %tmp3a
- %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
- store x86_mmx %tmp4a, ptr %A
- %tmp9 = load x86_mmx, ptr %B
- %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
+ %tmp4a = bitcast <2 x i32> %tmp4 to <1 x i64>
+ store <1 x i64> %tmp4a, ptr %A
+ %tmp9 = load <1 x i64>, ptr %B
+ %tmp9a = bitcast <1 x i64> %tmp9 to <2 x i32>
%tmp10 = sub <2 x i32> %tmp4, %tmp9a
- %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx
- store x86_mmx %tmp10a, ptr %A
- %tmp15 = load x86_mmx, ptr %B
- %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
- %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
+ %tmp10a = bitcast <2 x i32> %tmp4 to <1 x i64>
+ store <1 x i64> %tmp10a, ptr %A
+ %tmp15 = load <1 x i64>, ptr %B
+ %tmp10b = bitcast <1 x i64> %tmp10a to <2 x i32>
+ %tmp15a = bitcast <1 x i64> %tmp15 to <2 x i32>
%tmp16 = mul <2 x i32> %tmp10b, %tmp15a
- %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
- store x86_mmx %tmp16a, ptr %A
- %tmp21 = load x86_mmx, ptr %B
- %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
- %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
+ %tmp16a = bitcast <2 x i32> %tmp16 to <1 x i64>
+ store <1 x i64> %tmp16a, ptr %A
+ %tmp21 = load <1 x i64>, ptr %B
+ %tmp16b = bitcast <1 x i64> %tmp16a to <2 x i32>
+ %tmp21a = bitcast <1 x i64> %tmp21 to <2 x i32>
%tmp22 = and <2 x i32> %tmp16b, %tmp21a
- %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
- store x86_mmx %tmp22a, ptr %A
- %tmp27 = load x86_mmx, ptr %B
- %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
- %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
+ %tmp22a = bitcast <2 x i32> %tmp22 to <1 x i64>
+ store <1 x i64> %tmp22a, ptr %A
+ %tmp27 = load <1 x i64>, ptr %B
+ %tmp22b = bitcast <1 x i64> %tmp22a to <2 x i32>
+ %tmp27a = bitcast <1 x i64> %tmp27 to <2 x i32>
%tmp28 = or <2 x i32> %tmp22b, %tmp27a
- %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
- store x86_mmx %tmp28a, ptr %A
- %tmp33 = load x86_mmx, ptr %B
- %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
- %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
+ %tmp28a = bitcast <2 x i32> %tmp28 to <1 x i64>
+ store <1 x i64> %tmp28a, ptr %A
+ %tmp33 = load <1 x i64>, ptr %B
+ %tmp28b = bitcast <1 x i64> %tmp28a to <2 x i32>
+ %tmp33a = bitcast <1 x i64> %tmp33 to <2 x i32>
%tmp34 = xor <2 x i32> %tmp28b, %tmp33a
- %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
- store x86_mmx %tmp34a, ptr %A
+ %tmp34a = bitcast <2 x i32> %tmp34 to <1 x i64>
+ store <1 x i64> %tmp34a, ptr %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
@@ -336,62 +336,61 @@ define void @test2(ptr %A, ptr %B) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %tmp1 = load x86_mmx, ptr %A
- %tmp3 = load x86_mmx, ptr %B
- %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
- %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
+ %tmp1 = load <1 x i64>, ptr %A
+ %tmp3 = load <1 x i64>, ptr %B
+ %tmp1a = bitcast <1 x i64> %tmp1 to <4 x i16>
+ %tmp3a = bitcast <1 x i64> %tmp3 to <4 x i16>
%tmp4 = add <4 x i16> %tmp1a, %tmp3a
- %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
- store x86_mmx %tmp4a, ptr %A
- %tmp7 = load x86_mmx, ptr %B
- %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7)
- store x86_mmx %tmp12, ptr %A
- %tmp16 = load x86_mmx, ptr %B
- %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16)
- store x86_mmx %tmp21, ptr %A
- %tmp27 = load x86_mmx, ptr %B
- %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
- %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
+ %tmp4a = bitcast <4 x i16> %tmp4 to <1 x i64>
+ store <1 x i64> %tmp4a, ptr %A
+ %tmp7 = load <1 x i64>, ptr %B
+ %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %tmp4a, <1 x i64> %tmp7)
+ store <1 x i64> %tmp12, ptr %A
+ %tmp16 = load <1 x i64>, ptr %B
+ %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %tmp12, <1 x i64> %tmp16)
+ store <1 x i64> %tmp21, ptr %A
+ %tmp27 = load <1 x i64>, ptr %B
+ %tmp21a = bitcast <1 x i64> %tmp21 to <4 x i16>
+ %tmp27a = bitcast <1 x i64> %tmp27 to <4 x i16>
%tmp28 = sub <4 x i16> %tmp21a, %tmp27a
- %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
- store x86_mmx %tmp28a, ptr %A
- %tmp31 = load x86_mmx, ptr %B
- %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31)
- store x86_mmx %tmp36, ptr %A
- %tmp40 = load x86_mmx, ptr %B
- %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40)
- store x86_mmx %tmp45, ptr %A
- %tmp51 = load x86_mmx, ptr %B
- %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
- %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
+ %tmp28a = bitcast <4 x i16> %tmp28 to <1 x i64>
+ store <1 x i64> %tmp28a, ptr %A
+ %tmp31 = load <1 x i64>, ptr %B
+ %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %tmp28a, <1 x i64> %tmp31)
+ store <1 x i64> %tmp36, ptr %A
+ %tmp40 = load <1 x i64>, ptr %B
+ %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %tmp36, <1 x i64> %tmp40)
+ store <1 x i64> %tmp45, ptr %A
+ %tmp51 = load <1 x i64>, ptr %B
+ %tmp45a = bitcast <1 x i64> %tmp45 to <4 x i16>
+ %tmp51a = bitcast <1 x i64> %tmp51 to <4 x i16>
%tmp52 = mul <4 x i16> %tmp45a, %tmp51a
- %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
- store x86_mmx %tmp52a, ptr %A
- %tmp55 = load x86_mmx, ptr %B
- %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55)
- store x86_mmx %tmp60, ptr %A
- %tmp64 = load x86_mmx, ptr %B
- %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64)
- %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx
- store x86_mmx %tmp70, ptr %A
- %tmp75 = load x86_mmx, ptr %B
- %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
- %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
+ %tmp52a = bitcast <4 x i16> %tmp52 to <1 x i64>
+ store <1 x i64> %tmp52a, ptr %A
+ %tmp55 = load <1 x i64>, ptr %B
+ %tmp60 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %tmp52a, <1 x i64> %tmp55)
+ store <1 x i64> %tmp60, ptr %A
+ %tmp64 = load <1 x i64>, ptr %B
+ %tmp69 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %tmp60, <1 x i64> %tmp64)
+ store <1 x i64> %tmp69, ptr %A
+ %tmp75 = load <1 x i64>, ptr %B
+ %tmp70a = bitcast <1 x i64> %tmp69 to <4 x i16>
+ %tmp75a = bitcast <1 x i64> %tmp75 to <4 x i16>
%tmp76 = and <4 x i16> %tmp70a, %tmp75a
- %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
- store x86_mmx %tmp76a, ptr %A
- %tmp81 = load x86_mmx, ptr %B
- %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
- %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
+ %tmp76a = bitcast <4 x i16> %tmp76 to <1 x i64>
+ store <1 x i64> %tmp76a, ptr %A
+ %tmp81 = load <1 x i64>, ptr %B
+ %tmp76b = bitcast <1 x i64> %tmp76a to <4 x i16>
+ %tmp81a = bitcast <1 x i64> %tmp81 to <4 x i16>
%tmp82 = or <4 x i16> %tmp76b, %tmp81a
- %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
- store x86_mmx %tmp82a, ptr %A
- %tmp87 = load x86_mmx, ptr %B
- %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
- %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
+ %tmp82a = bitcast <4 x i16> %tmp82 to <1 x i64>
+ store <1 x i64> %tmp82a, ptr %A
+ %tmp87 = load <1 x i64>, ptr %B
+ %tmp82b = bitcast <1 x i64> %tmp82a to <4 x i16>
+ %tmp87a = bitcast <1 x i64> %tmp87 to <4 x i16>
%tmp88 = xor <4 x i16> %tmp82b, %tmp87a
- %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
- store x86_mmx %tmp88a, ptr %A
+ %tmp88a = bitcast <4 x i16> %tmp88 to <1 x i64>
+ store <1 x i64> %tmp88a, ptr %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
@@ -587,10 +586,10 @@ define void @ti8a(double %a, double %b) nounwind {
; X64-NEXT: movq %mm1, 0
; X64-NEXT: retq
entry:
- %tmp1 = bitcast double %a to x86_mmx
- %tmp2 = bitcast double %b to x86_mmx
- %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
- store x86_mmx %tmp3, ptr null
+ %tmp1 = bitcast double %a to <1 x i64>
+ %tmp2 = bitcast double %b to <1 x i64>
+ %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ store <1 x i64> %tmp3, ptr null
ret void
}
@@ -610,10 +609,10 @@ define void @ti16a(double %a, double %b) nounwind {
; X64-NEXT: movq %mm1, 0
; X64-NEXT: retq
entry:
- %tmp1 = bitcast double %a to x86_mmx
- %tmp2 = bitcast double %b to x86_mmx
- %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
- store x86_mmx %tmp3, ptr null
+ %tmp1 = bitcast double %a to <1 x i64>
+ %tmp2 = bitcast double %b to <1 x i64>
+ %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ store <1 x i64> %tmp3, ptr null
ret void
}
@@ -633,10 +632,10 @@ define void @ti32a(double %a, double %b) nounwind {
; X64-NEXT: movq %mm1, 0
; X64-NEXT: retq
entry:
- %tmp1 = bitcast double %a to x86_mmx
- %tmp2 = bitcast double %b to x86_mmx
- %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
- store x86_mmx %tmp3, ptr null
+ %tmp1 = bitcast double %a to <1 x i64>
+ %tmp2 = bitcast double %b to <1 x i64>
+ %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ store <1 x i64> %tmp3, ptr null
ret void
}
@@ -656,10 +655,10 @@ define void @ti64a(double %a, double %b) nounwind {
; X64-NEXT: movq %mm1, 0
; X64-NEXT: retq
entry:
- %tmp1 = bitcast double %a to x86_mmx
- %tmp2 = bitcast double %b to x86_mmx
- %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
- store x86_mmx %tmp3, ptr null
+ %tmp1 = bitcast double %a to <1 x i64>
+ %tmp2 = bitcast double %b to <1 x i64>
+ %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ store <1 x i64> %tmp3, ptr null
ret void
}
@@ -687,28 +686,28 @@ define i64 @pr43922() nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx bitcast (<2 x i32> <i32 2058005162, i32 2058005162> to x86_mmx), i32 268435456)
- %1 = bitcast x86_mmx %0 to i64
+ %0 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> bitcast (<2 x i32> <i32 2058005162, i32 2058005162> to <1 x i64>), i32 268435456)
+ %1 = bitcast <1 x i64> %0 to i64
ret i64 %1
}
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32)
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
index a124091..fb2517f 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
@@ -4,9 +4,9 @@
define void @bar() {
entry:
- %0 = bitcast double 0.0 to x86_mmx
- %1 = call x86_mmx @foo(x86_mmx %0)
+ %0 = bitcast double 0.0 to <1 x i64>
+ %1 = call <1 x i64> @foo(<1 x i64> %0)
ret void
}
-declare x86_mmx @foo(x86_mmx)
+declare <1 x i64> @foo(<1 x i64>)
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast.ll b/llvm/test/CodeGen/X86/mmx-bitcast.ll
index 49c2027..5e5be82 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast.ll
@@ -8,9 +8,9 @@ define i64 @t0(ptr %p) {
; CHECK-NEXT: paddq %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %t = load x86_mmx, ptr %p
- %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t)
- %s = bitcast x86_mmx %u to i64
+ %t = load <1 x i64>, ptr %p
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %t)
+ %s = bitcast <1 x i64> %u to i64
ret i64 %s
}
@@ -21,9 +21,9 @@ define i64 @t1(ptr %p) {
; CHECK-NEXT: paddd %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %t = load x86_mmx, ptr %p
- %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t)
- %s = bitcast x86_mmx %u to i64
+ %t = load <1 x i64>, ptr %p
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %t)
+ %s = bitcast <1 x i64> %u to i64
ret i64 %s
}
@@ -34,9 +34,9 @@ define i64 @t2(ptr %p) {
; CHECK-NEXT: paddw %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %t = load x86_mmx, ptr %p
- %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t)
- %s = bitcast x86_mmx %u to i64
+ %t = load <1 x i64>, ptr %p
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %t)
+ %s = bitcast <1 x i64> %u to i64
ret i64 %s
}
@@ -47,13 +47,13 @@ define i64 @t3(ptr %p) {
; CHECK-NEXT: paddb %mm0, %mm0
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %t = load x86_mmx, ptr %p
- %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t)
- %s = bitcast x86_mmx %u to i64
+ %t = load <1 x i64>, ptr %p
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %t)
+ %s = bitcast <1 x i64> %u to i64
ret i64 %s
}
-@R = external global x86_mmx
+@R = external global <1 x i64>
define void @t4(<1 x i64> %A, <1 x i64> %B) {
; CHECK-LABEL: t4:
@@ -66,10 +66,8 @@ define void @t4(<1 x i64> %A, <1 x i64> %B) {
; CHECK-NEXT: emms
; CHECK-NEXT: retq
entry:
- %tmp2 = bitcast <1 x i64> %A to x86_mmx
- %tmp3 = bitcast <1 x i64> %B to x86_mmx
- %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp2, x86_mmx %tmp3)
- store x86_mmx %tmp7, ptr @R
+ %tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %A, <1 x i64> %B)
+ store <1 x i64> %tmp7, ptr @R
tail call void @llvm.x86.mmx.emms()
ret void
}
@@ -88,7 +86,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone {
ret i64 %conv
}
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
define <1 x i64> @t6(i64 %t) {
; CHECK-LABEL: t6:
@@ -98,16 +96,14 @@ define <1 x i64> @t6(i64 %t) {
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%t1 = insertelement <1 x i64> undef, i64 %t, i32 0
- %t0 = bitcast <1 x i64> %t1 to x86_mmx
- %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48)
- %t3 = bitcast x86_mmx %t2 to <1 x i64>
- ret <1 x i64> %t3
+ %t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48)
+ ret <1 x i64> %t2
}
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll b/llvm/test/CodeGen/X86/mmx-build-vector.ll
index b919c9a..d8a010b 100644
--- a/llvm/test/CodeGen/X86/mmx-build-vector.ll
+++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll
@@ -8,7 +8,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefix=X64
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
;
; v2i32
@@ -35,9 +35,9 @@ define void @build_v2i32_01(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
- %3 = bitcast <2 x i32> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -58,9 +58,9 @@ define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 0, i32 1
- %3 = bitcast <2 x i32> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -92,9 +92,9 @@ define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 undef, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
- %3 = bitcast <2 x i32> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -119,9 +119,9 @@ define void @build_v2i32_z1(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
- %3 = bitcast <2 x i32> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -153,9 +153,9 @@ define void @build_v2i32_00(ptr%p0, i32 %a0, i32 %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a0, i32 1
- %3 = bitcast <2 x i32> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -194,9 +194,9 @@ define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 %a2, i32 2
%4 = insertelement <4 x i16> %3, i16 %a3, i32 3
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
- store x86_mmx %6, ptr%p0
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+ store <1 x i64> %6, ptr%p0
ret void
}
@@ -229,9 +229,9 @@ define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 0, i32 2
%4 = insertelement <4 x i16> %3, i16 0, i32 3
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
- store x86_mmx %6, ptr%p0
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+ store <1 x i64> %6, ptr%p0
ret void
}
@@ -254,9 +254,9 @@ define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 undef, i32 1
%3 = insertelement <4 x i16> %2, i16 undef, i32 2
%4 = insertelement <4 x i16> %3, i16 0, i32 3
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
- store x86_mmx %6, ptr%p0
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+ store <1 x i64> %6, ptr%p0
ret void
}
@@ -281,9 +281,9 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 0, i32 1
%3 = insertelement <4 x i16> %2, i16 undef, i32 2
%4 = insertelement <4 x i16> %3, i16 0, i32 3
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
- store x86_mmx %6, ptr%p0
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+ store <1 x i64> %6, ptr%p0
ret void
}
@@ -316,9 +316,9 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 %a2, i32 2
%4 = insertelement <4 x i16> %3, i16 undef, i32 3
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
- store x86_mmx %6, ptr%p0
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+ store <1 x i64> %6, ptr%p0
ret void
}
@@ -353,9 +353,9 @@ define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
%2 = insertelement <4 x i16> %1, i16 undef, i32 1
%3 = insertelement <4 x i16> %2, i16 %a0, i32 2
%4 = insertelement <4 x i16> %3, i16 %a0, i32 3
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
- store x86_mmx %6, ptr%p0
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+ store <1 x i64> %6, ptr%p0
ret void
}
@@ -414,9 +414,9 @@ define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 %a5, i32 5
%7 = insertelement <8 x i8> %6, i8 %a6, i32 6
%8 = insertelement <8 x i8> %7, i8 %a7, i32 7
- %9 = bitcast <8 x i8> %8 to x86_mmx
- %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
- store x86_mmx %10, ptr%p0
+ %9 = bitcast <8 x i8> %8 to <1 x i64>
+ %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+ store <1 x i64> %10, ptr%p0
ret void
}
@@ -469,9 +469,9 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 %a5, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 %a7, i32 7
- %9 = bitcast <8 x i8> %8 to x86_mmx
- %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
- store x86_mmx %10, ptr%p0
+ %9 = bitcast <8 x i8> %8 to <1 x i64>
+ %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+ store <1 x i64> %10, ptr%p0
ret void
}
@@ -522,9 +522,9 @@ define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 0, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 undef, i32 7
- %9 = bitcast <8 x i8> %8 to x86_mmx
- %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
- store x86_mmx %10, ptr%p0
+ %9 = bitcast <8 x i8> %8 to <1 x i64>
+ %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+ store <1 x i64> %10, ptr%p0
ret void
}
@@ -551,9 +551,9 @@ define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 0, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 0, i32 7
- %9 = bitcast <8 x i8> %8 to x86_mmx
- %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
- store x86_mmx %10, ptr%p0
+ %9 = bitcast <8 x i8> %8 to <1 x i64>
+ %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+ store <1 x i64> %10, ptr%p0
ret void
}
@@ -582,9 +582,9 @@ define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 0, i32 5
%7 = insertelement <8 x i8> %6, i8 0, i32 6
%8 = insertelement <8 x i8> %7, i8 undef, i32 7
- %9 = bitcast <8 x i8> %8 to x86_mmx
- %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
- store x86_mmx %10, ptr%p0
+ %9 = bitcast <8 x i8> %8 to <1 x i64>
+ %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+ store <1 x i64> %10, ptr%p0
ret void
}
@@ -626,9 +626,9 @@ define void @build_v8i8_00000000(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
%6 = insertelement <8 x i8> %5, i8 %a0, i32 5
%7 = insertelement <8 x i8> %6, i8 %a0, i32 6
%8 = insertelement <8 x i8> %7, i8 %a0, i32 7
- %9 = bitcast <8 x i8> %8 to x86_mmx
- %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9)
- store x86_mmx %10, ptr%p0
+ %9 = bitcast <8 x i8> %8 to <1 x i64>
+ %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9)
+ store <1 x i64> %10, ptr%p0
ret void
}
@@ -669,9 +669,9 @@ define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
- %3 = bitcast <2 x float> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x float> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -707,9 +707,9 @@ define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float 0.0, i32 1
- %3 = bitcast <2 x float> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x float> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -742,9 +742,9 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float undef, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
- %3 = bitcast <2 x float> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x float> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -780,9 +780,9 @@ define void @build_v2f32_z1(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float 0.0, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
- %3 = bitcast <2 x float> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x float> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
@@ -815,8 +815,8 @@ define void @build_v2f32_00(ptr%p0, float %a0, float %a1) nounwind {
; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float %a0, i32 1
- %3 = bitcast <2 x float> %2 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
- store x86_mmx %4, ptr%p0
+ %3 = bitcast <2 x float> %2 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3)
+ store <1 x i64> %4, ptr%p0
ret void
}
diff --git a/llvm/test/CodeGen/X86/mmx-coalescing.ll b/llvm/test/CodeGen/X86/mmx-coalescing.ll
index dac526f..589f5af 100644
--- a/llvm/test/CodeGen/X86/mmx-coalescing.ll
+++ b/llvm/test/CodeGen/X86/mmx-coalescing.ll
@@ -42,9 +42,9 @@ entry:
%SA2 = getelementptr inbounds %SA, ptr %pSA, i64 0, i32 4
%v3 = load ptr, ptr %SA2, align 8
%v4 = bitcast <1 x i64> %v0 to <4 x i16>
- %v5 = bitcast <4 x i16> %v4 to x86_mmx
- %v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18)
- %v7 = bitcast x86_mmx %v6 to <4 x i16>
+ %v5 = bitcast <4 x i16> %v4 to <1 x i64>
+ %v6 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v5, i8 -18)
+ %v7 = bitcast <1 x i64> %v6 to <4 x i16>
%v8 = bitcast <4 x i16> %v7 to <1 x i64>
%v9 = extractelement <1 x i64> %v8, i32 0
%v10 = bitcast i64 %v9 to <2 x i32>
@@ -55,18 +55,18 @@ entry:
if.A:
%pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ]
%v17 = extractelement <1 x i64> %pa, i32 0
- %v18 = bitcast i64 %v17 to x86_mmx
- %v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2
- %v20 = bitcast x86_mmx %v19 to i64
+ %v18 = bitcast i64 %v17 to <1 x i64>
+ %v19 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %v18, i32 %B) #2
+ %v20 = bitcast <1 x i64> %v19 to i64
%v21 = insertelement <1 x i64> undef, i64 %v20, i32 0
%cmp3 = icmp eq i64 %v20, 0
br i1 %cmp3, label %if.C, label %merge
if.B:
%v34 = bitcast <1 x i64> %v8 to <4 x i16>
- %v35 = bitcast <4 x i16> %v34 to x86_mmx
- %v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18)
- %v37 = bitcast x86_mmx %v36 to <4 x i16>
+ %v35 = bitcast <4 x i16> %v34 to <1 x i64>
+ %v36 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v35, i8 -18)
+ %v37 = bitcast <1 x i64> %v36 to <4 x i16>
%v38 = bitcast <4 x i16> %v37 to <1 x i64>
br label %if.C
@@ -80,9 +80,9 @@ if.C:
merge:
%vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ]
%v130 = bitcast <1 x i64> %vy to <4 x i16>
- %v131 = bitcast <4 x i16> %v130 to x86_mmx
- %v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18)
- %v133 = bitcast x86_mmx %v132 to <4 x i16>
+ %v131 = bitcast <4 x i16> %v130 to <1 x i64>
+ %v132 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v131, i8 -18)
+ %v133 = bitcast <1 x i64> %v132 to <4 x i16>
%v134 = bitcast <4 x i16> %v133 to <1 x i64>
%v135 = extractelement <1 x i64> %v134, i32 0
%v136 = bitcast i64 %v135 to <2 x i32>
@@ -91,5 +91,5 @@ merge:
}
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll
index 11473f3..51a71da 100644
--- a/llvm/test/CodeGen/X86/mmx-cvt.ll
+++ b/llvm/test/CodeGen/X86/mmx-cvt.ll
@@ -23,9 +23,9 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
- %6 = bitcast i64 %5 to x86_mmx
- %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
- %8 = bitcast x86_mmx %7 to i64
+ %6 = bitcast i64 %5 to <1 x i64>
+ %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+ %8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -49,9 +49,9 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
- %6 = bitcast i64 %5 to x86_mmx
- %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
- %8 = bitcast x86_mmx %7 to i64
+ %6 = bitcast i64 %5 to <1 x i64>
+ %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+ %8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -73,9 +73,9 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
%3 = fptosi <2 x double> %0 to <2 x i32>
- %4 = bitcast <2 x i32> %3 to x86_mmx
- %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
- %6 = bitcast x86_mmx %5 to i64
+ %4 = bitcast <2 x i32> %3 to <1 x i64>
+ %5 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %4, <1 x i64> %4)
+ %6 = bitcast <1 x i64> %5 to i64
%7 = insertelement <1 x i64> undef, i64 %6, i32 0
store <1 x i64> %7, ptr %1
ret void
@@ -99,9 +99,9 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
- %6 = bitcast i64 %5 to x86_mmx
- %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
- %8 = bitcast x86_mmx %7 to i64
+ %6 = bitcast i64 %5 to <1 x i64>
+ %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+ %8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -125,9 +125,9 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
%3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
- %6 = bitcast i64 %5 to x86_mmx
- %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
- %8 = bitcast x86_mmx %7 to i64
+ %6 = bitcast i64 %5 to <1 x i64>
+ %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+ %8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -150,9 +150,9 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
; X64-NEXT: retq
%3 = fptosi <4 x float> %0 to <4 x i32>
%4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
- %5 = bitcast <2 x i32> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
- %7 = bitcast x86_mmx %6 to i64
+ %5 = bitcast <2 x i32> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5)
+ %7 = bitcast <1 x i64> %6 to i64
%8 = insertelement <1 x i64> undef, i64 %7, i32 0
store <1 x i64> %8, ptr %1
ret void
@@ -176,9 +176,9 @@ define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind {
%3 = fptosi <4 x float> %0 to <4 x i32>
%4 = bitcast <4 x i32> %3 to <2 x i64>
%5 = extractelement <2 x i64> %4, i32 0
- %6 = bitcast i64 %5 to x86_mmx
- %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
- %8 = bitcast x86_mmx %7 to i64
+ %6 = bitcast i64 %5 to <1 x i64>
+ %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6)
+ %8 = bitcast <1 x i64> %7 to i64
%9 = insertelement <1 x i64> undef, i64 %8, i32 0
store <1 x i64> %9, ptr %1
ret void
@@ -210,9 +210,9 @@ define <2 x double> @sitofp_v2i32_v2f64(ptr) nounwind {
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-NEXT: retq
- %2 = load x86_mmx, ptr %0, align 8
- %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
- %4 = bitcast x86_mmx %3 to i64
+ %2 = load <1 x i64>, ptr %0, align 8
+ %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
+ %4 = bitcast <1 x i64> %3 to i64
%5 = insertelement <2 x i64> undef, i64 %4, i32 0
%6 = bitcast <2 x i64> %5 to <4 x i32>
%7 = shufflevector <4 x i32> %6, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
@@ -237,9 +237,9 @@ define <4 x float> @sitofp_v2i32_v2f32(ptr) nounwind {
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
- %2 = load x86_mmx, ptr %0, align 8
- %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
- %4 = bitcast x86_mmx %3 to <2 x i32>
+ %2 = load <1 x i64>, ptr %0, align 8
+ %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
+ %4 = bitcast <1 x i64> %3 to <2 x i32>
%5 = shufflevector <2 x i32> %4, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = sitofp <4 x i32> %5 to <4 x float>
ret <4 x float> %6
@@ -269,9 +269,9 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind {
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
- %2 = load x86_mmx, ptr %0, align 8
- %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
- %4 = bitcast x86_mmx %3 to i64
+ %2 = load <1 x i64>, ptr %0, align 8
+ %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2)
+ %4 = bitcast <1 x i64> %3 to i64
%5 = insertelement <2 x i64> undef, i64 %4, i32 0
%6 = insertelement <2 x i64> %5, i64 0, i32 1
%7 = bitcast <2 x i64> %6 to <4 x i32>
@@ -279,7 +279,7 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind {
ret <4 x float> %8
}
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index a313399..6fe3bc4 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -29,13 +29,13 @@ define i64 @t0(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
define i64 @t1(ptr %a, ptr %b) nounwind {
; X86-LABEL: t1:
@@ -64,13 +64,13 @@ define i64 @t1(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32)
define i64 @t2(ptr %a, ptr %b) nounwind {
; X86-LABEL: t2:
@@ -99,13 +99,13 @@ define i64 @t2(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32)
define i64 @t3(ptr %a, ptr %b) nounwind {
; X86-LABEL: t3:
@@ -134,13 +134,13 @@ define i64 @t3(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32)
define i64 @t4(ptr %a, ptr %b) nounwind {
; X86-LABEL: t4:
@@ -169,13 +169,13 @@ define i64 @t4(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32)
define i64 @t5(ptr %a, ptr %b) nounwind {
; X86-LABEL: t5:
@@ -204,13 +204,13 @@ define i64 @t5(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32)
define i64 @t6(ptr %a, ptr %b) nounwind {
; X86-LABEL: t6:
@@ -239,13 +239,13 @@ define i64 @t6(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32)
define i64 @t7(ptr %a, ptr %b) nounwind {
; X86-LABEL: t7:
@@ -274,15 +274,15 @@ define i64 @t7(ptr %a, ptr %b) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a, align 8
+ %0 = load <1 x i64>, ptr %a, align 8
%1 = load i32, ptr %b, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %0, i32 %1)
- %3 = bitcast x86_mmx %2 to i64
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %0, i32 %1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32)
-define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt0(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt0:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -312,16 +312,16 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()
-define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt1(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt1:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -351,15 +351,15 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
-define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt2(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt2:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -389,15 +389,15 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
-define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt3(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt3:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -427,15 +427,15 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>)
-define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt4(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt4:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -465,15 +465,15 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>)
-define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt5(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt5:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -503,15 +503,15 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>)
-define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt6(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt6:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -541,15 +541,15 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>)
-define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt7(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt7:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -579,15 +579,15 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>)
-define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
+define i64 @tt8(<1 x i64> %t, ptr %q) nounwind {
; X86-LABEL: tt8:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
@@ -617,13 +617,13 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %v = load x86_mmx, ptr %q
- %u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v)
- %s = bitcast x86_mmx %u to i64
+ %v = load <1 x i64>, ptr %q
+ %u = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %t, <1 x i64> %v)
+ %s = bitcast <1 x i64> %u to i64
call void @llvm.x86.mmx.emms()
ret i64 %s
}
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>)
define void @test_psrlq_by_volatile_shift_amount(ptr %t) nounwind {
; X86-LABEL: test_psrlq_by_volatile_shift_amount:
@@ -653,8 +653,8 @@ entry:
call void @llvm.lifetime.start(i64 4, ptr nonnull %0)
store volatile i32 1, ptr %0, align 4
%1 = load volatile i32, ptr %0, align 4
- %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx bitcast (<1 x i64> <i64 255> to x86_mmx), i32 %1)
- store x86_mmx %2, ptr %t, align 8
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> <i64 255>, i32 %1)
+ store <1 x i64> %2, ptr %t, align 8
call void @llvm.lifetime.end(i64 4, ptr nonnull %0)
ret void
}
@@ -663,7 +663,7 @@ declare void @llvm.lifetime.start(i64, ptr nocapture)
declare void @llvm.lifetime.end(i64, ptr nocapture)
; Make sure we shrink this vector load and fold it.
-define x86_mmx @vec_load(ptr %x) {
+define <1 x i64> @vec_load(ptr %x) {
; X86-LABEL: vec_load:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
@@ -694,10 +694,10 @@ define x86_mmx @vec_load(ptr %x) {
%y = extractelement <4 x float> %z, i32 0
%a = insertelement <2 x float> undef, float %y, i32 0
%b = insertelement <2 x float> %a, float %y, i32 1
- %c = bitcast <2 x float> %b to x86_mmx
- %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
- ret x86_mmx %d
+ %c = bitcast <2 x float> %b to <1 x i64>
+ %d = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %c, <1 x i64> %c)
+ ret <1 x i64> %d
}
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/mmx-fold-zero.ll b/llvm/test/CodeGen/X86/mmx-fold-zero.ll
index b2c94e3..a6e12758 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-zero.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-zero.ll
@@ -115,32 +115,32 @@ define double @mmx_zero(double, double, double, double) nounwind {
; X64-LARGE-NEXT: paddw %mm2, %mm0
; X64-LARGE-NEXT: movq2dq %mm0, %xmm0
; X64-LARGE-NEXT: retq
- %5 = bitcast double %0 to x86_mmx
- %6 = bitcast double %1 to x86_mmx
- %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6)
- %8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
- %9 = bitcast double %2 to x86_mmx
- %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9)
- %11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10)
- %12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11)
- %13 = bitcast double %3 to x86_mmx
- %14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13)
- %15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9)
- %16 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13)
- %17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10)
- %18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11)
- %19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8)
- %20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7)
- %21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
- %22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12)
- %23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15)
- %24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6)
- %25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16)
- %26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17)
- %27 = bitcast x86_mmx %26 to double
+ %5 = bitcast double %0 to <1 x i64>
+ %6 = bitcast double %1 to <1 x i64>
+ %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %6)
+ %8 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %7, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
+ %9 = bitcast double %2 to <1 x i64>
+ %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %8, <1 x i64> %9)
+ %11 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %5, <1 x i64> %10)
+ %12 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %6, <1 x i64> %11)
+ %13 = bitcast double %3 to <1 x i64>
+ %14 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %12, <1 x i64> %13)
+ %15 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %14, <1 x i64> %9)
+ %16 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %15, <1 x i64> %13)
+ %17 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %16, <1 x i64> %10)
+ %18 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %17, <1 x i64> %11)
+ %19 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %18, <1 x i64> %8)
+ %20 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %19, <1 x i64> %7)
+ %21 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %20, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>))
+ %22 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %21, <1 x i64> %12)
+ %23 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %22, <1 x i64> %15)
+ %24 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %23, <1 x i64> %6)
+ %25 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %24, <1 x i64> %16)
+ %26 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %25, <1 x i64> %17)
+ %27 = bitcast <1 x i64> %26 to double
ret double %27
}
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/mmx-intrinsics.ll b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
index 69fc636..a7b6ed4 100644
--- a/llvm/test/CodeGen/X86/mmx-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefixes=ALL,X64
; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+avx | FileCheck %s --check-prefixes=ALL,X64
-declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test1:
@@ -40,16 +40,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test88:
@@ -85,16 +85,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test87:
@@ -130,16 +130,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test86:
@@ -175,16 +175,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test85:
@@ -220,16 +220,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test84:
@@ -265,16 +265,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test83:
@@ -310,16 +310,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test82:
@@ -355,16 +355,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test81:
@@ -400,16 +400,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test80:
@@ -445,16 +445,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test79:
@@ -490,16 +490,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test78:
@@ -535,16 +535,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test77:
@@ -580,16 +580,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test76:
@@ -625,16 +625,16 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test75:
@@ -670,16 +670,16 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test74:
@@ -715,16 +715,16 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone
define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test73:
@@ -754,15 +754,15 @@ define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone
define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test72:
@@ -792,9 +792,9 @@ define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
@@ -825,15 +825,15 @@ define i64 @test72_2(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test71:
@@ -859,13 +859,13 @@ define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to i64
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to i64
ret i64 %2
}
-declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone
define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test70:
@@ -895,9 +895,9 @@ define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
@@ -928,15 +928,15 @@ define i64 @test70_2(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone
define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test69:
@@ -966,15 +966,15 @@ define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone
define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test68:
@@ -1000,13 +1000,13 @@ define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to i64
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to i64
ret i64 %2
}
-declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone
define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test67:
@@ -1036,15 +1036,15 @@ define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone
define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test66:
@@ -1074,9 +1074,9 @@ define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
@@ -1107,15 +1107,15 @@ define i64 @test66_2(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test65:
@@ -1146,17 +1146,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test64:
@@ -1187,17 +1187,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test63:
@@ -1224,15 +1224,15 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test62:
@@ -1263,17 +1263,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test61:
@@ -1304,17 +1304,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test60:
@@ -1341,15 +1341,15 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test59:
@@ -1380,17 +1380,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test58:
@@ -1421,17 +1421,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test56:
@@ -1467,16 +1467,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test55:
@@ -1512,16 +1512,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test54:
@@ -1557,16 +1557,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test53:
@@ -1602,16 +1602,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test52:
@@ -1647,10 +1647,10 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
@@ -1690,16 +1690,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test50:
@@ -1735,16 +1735,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test49:
@@ -1780,16 +1780,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test48:
@@ -1825,16 +1825,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test47:
@@ -1870,16 +1870,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test46:
@@ -1915,16 +1915,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test45:
@@ -1960,10 +1960,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
@@ -1994,17 +1994,17 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var = bitcast i64 %0 to x86_mmx
+ %mmx_var = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1 = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1 = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test43:
@@ -2040,16 +2040,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test42:
@@ -2085,16 +2085,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test41:
@@ -2130,16 +2130,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test40:
@@ -2175,16 +2175,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test39:
@@ -2220,16 +2220,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test38:
@@ -2265,16 +2265,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test37:
@@ -2310,16 +2310,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test36:
@@ -2346,15 +2346,15 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var = bitcast i64 %0 to x86_mmx
+ %mmx_var = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1 = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1 = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test35:
@@ -2390,16 +2390,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test34:
@@ -2435,16 +2435,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test33:
@@ -2480,16 +2480,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test32:
@@ -2525,14 +2525,14 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test31:
@@ -2568,16 +2568,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test30:
@@ -2613,16 +2613,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test29:
@@ -2658,16 +2658,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test28:
@@ -2703,16 +2703,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test27:
@@ -2748,16 +2748,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test26:
@@ -2793,16 +2793,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind
+declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind
define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp {
; X86-LABEL: test25:
@@ -2819,12 +2819,12 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
- tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
+ tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind
ret void
}
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone
define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test24:
@@ -2850,12 +2850,12 @@ define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx
- %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind
+ %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64>
+ %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind
ret i32 %1
}
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind
define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp {
; X86-LABEL: test23:
@@ -2892,13 +2892,13 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp {
entry:
%0 = bitcast <1 x i64> %n to <8 x i8>
%1 = bitcast <1 x i64> %d to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind
ret void
}
-declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test22:
@@ -2934,16 +2934,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone
define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test21:
@@ -2972,9 +2972,9 @@ define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %1 = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
@@ -3005,15 +3005,15 @@ define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %1 = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <2 x i32>
%5 = extractelement <2 x i32> %4, i32 0
ret i32 %5
}
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test20:
@@ -3049,14 +3049,14 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone
define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test19:
@@ -3081,12 +3081,12 @@ define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %1 = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone
+ %1 = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone
ret <2 x double> %2
}
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp {
; X86-LABEL: test18:
@@ -3109,14 +3109,14 @@ define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
- %1 = bitcast x86_mmx %0 to <2 x i32>
+ %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
+ %1 = bitcast <1 x i64> %0 to <2 x i32>
%2 = bitcast <2 x i32> %1 to <1 x i64>
%3 = extractelement <1 x i64> %2, i32 0
ret i64 %3
}
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp {
; X86-LABEL: test17:
@@ -3139,14 +3139,14 @@ define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
- %1 = bitcast x86_mmx %0 to <2 x i32>
+ %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
+ %1 = bitcast <1 x i64> %0 to <2 x i32>
%2 = bitcast <2 x i32> %1 to <1 x i64>
%3 = extractelement <1 x i64> %2, i32 0
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone
define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test16:
@@ -3173,15 +3173,15 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var = bitcast i64 %0 to x86_mmx
+ %mmx_var = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1 = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16)
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1 = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone
define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test15:
@@ -3210,15 +3210,15 @@ define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %1 = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %1 = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone
define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test14:
@@ -3247,15 +3247,15 @@ define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %1 = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone
define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp {
; X86-LABEL: test13:
@@ -3284,15 +3284,15 @@ define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp {
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %a to <8 x i8>
- %1 = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test12:
@@ -3328,16 +3328,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %2 = bitcast <2 x i32> %1 to x86_mmx
- %3 = bitcast <2 x i32> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <2 x i32>
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = bitcast <2 x i32> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <2 x i32>
%6 = bitcast <2 x i32> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test11:
@@ -3373,16 +3373,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test10:
@@ -3418,16 +3418,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %2 = bitcast <8 x i8> %1 to x86_mmx
- %3 = bitcast <8 x i8> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <8 x i8>
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = bitcast <8 x i8> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <8 x i8>
%6 = bitcast <8 x i8> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test9:
@@ -3463,16 +3463,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %2 = bitcast <8 x i8> %1 to x86_mmx
- %3 = bitcast <8 x i8> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <8 x i8>
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = bitcast <8 x i8> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <8 x i8>
%6 = bitcast <8 x i8> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test8:
@@ -3508,16 +3508,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test7:
@@ -3553,16 +3553,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %2 = bitcast <8 x i8> %1 to x86_mmx
- %3 = bitcast <8 x i8> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <8 x i8>
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = bitcast <8 x i8> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <8 x i8>
%6 = bitcast <8 x i8> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test6:
@@ -3598,16 +3598,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test5:
@@ -3643,16 +3643,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %2 = bitcast <2 x i32> %1 to x86_mmx
- %3 = bitcast <2 x i32> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <2 x i32>
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = bitcast <2 x i32> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <2 x i32>
%6 = bitcast <2 x i32> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test4:
@@ -3688,16 +3688,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test3:
@@ -3733,16 +3733,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
; X86-LABEL: test2:
@@ -3778,16 +3778,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %2 = bitcast <2 x i32> %1 to x86_mmx
- %3 = bitcast <2 x i32> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <2 x i32>
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = bitcast <2 x i32> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <2 x i32>
%6 = bitcast <2 x i32> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind {
+define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind {
; X86-LABEL: test89:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
@@ -3808,11 +3808,11 @@ define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind {
; X64-NEXT: movq %rdi, %mm0
; X64-NEXT: cvtpi2ps %mm0, %xmm0
; X64-NEXT: retq
- %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
+ %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b)
ret <4 x float> %c
}
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone
define void @test90() {
; ALL-LABEL: test90:
@@ -3852,13 +3852,11 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind {
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
entry:
- %0 = bitcast <1 x i64> %a.coerce to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2)
- %2 = bitcast x86_mmx %1 to <1 x i64>
- ret <1 x i64> %2
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2)
+ ret <1 x i64> %1
}
-declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg)
+declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg)
define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind {
; X86-LABEL: test_mm_extract_pi16:
@@ -3883,9 +3881,8 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind {
; X64-NEXT: pextrw $2, %mm0, %eax
; X64-NEXT: retq
entry:
- %0 = bitcast <1 x i64> %a.coerce to x86_mmx
- %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2)
+ %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2)
ret i32 %1
}
-declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg)
+declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg)
diff --git a/llvm/test/CodeGen/X86/mmx-only.ll b/llvm/test/CodeGen/X86/mmx-only.ll
index eab67e0..8a87350 100644
--- a/llvm/test/CodeGen/X86/mmx-only.ll
+++ b/llvm/test/CodeGen/X86/mmx-only.ll
@@ -3,7 +3,7 @@
; Test that turning off sse doesn't turn off mmx.
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone {
; CHECK-LABEL: @test88
@@ -11,10 +11,10 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
index fd8bd1f..6bb564c 100644
--- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
@@ -1,18 +1,18 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+mmx,+fma,+f16c,+avx512f -stop-after finalize-isel -o - %s | FileCheck %s
; This test ensures that the MXCSR is implicitly used by MMX FP instructions.
-define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
+define <1 x i64> @mxcsr_mmx(<4 x float> %a0) {
; CHECK: MMX_CVTPS2PIrr %{{[0-9]}}, implicit $mxcsr
; CHECK: MMX_CVTPI2PSrr %{{[0-9]}}, killed %{{[0-9]}}, implicit $mxcsr
; CHECK: MMX_CVTTPS2PIrr killed %{{[0-9]}}, implicit $mxcsr
; CHECK: MMX_CVTPI2PDrr killed %{{[0-9]$}}
; CHECK: MMX_CVTPD2PIrr killed %{{[0-9]}}, implicit $mxcsr
- %1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
- %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %1)
- %3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2)
- %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %3)
- %5 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
- ret x86_mmx %5
+ %1 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
+ %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %1)
+ %3 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %2)
+ %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %3)
+ %5 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
+ ret <1 x i64> %5
}
define half @mxcsr_f16c(float %a) {
@@ -41,11 +41,11 @@ define <8 x double> @mxcsr_fma_sae(<8 x double> %a, <8 x double> %b, <8 x double
ret <8 x double> %res
}
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>)
-declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>)
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>)
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>)
+declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>)
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>)
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>)
declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll
index 1f273eb..3b6ffac 100644
--- a/llvm/test/CodeGen/X86/nontemporal.ll
+++ b/llvm/test/CodeGen/X86/nontemporal.ll
@@ -193,11 +193,11 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
; X64-NEXT: movntq %mm0, (%rsi)
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %a0
- %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3)
- store x86_mmx %1, ptr %a1, align 8, !nontemporal !0
+ %0 = load <1 x i64>, ptr %a0
+ %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3)
+ store <1 x i64> %1, ptr %a1, align 8, !nontemporal !0
ret void
}
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/X86/pr13859.ll b/llvm/test/CodeGen/X86/pr13859.ll
index 9b290e6..3546647 100644
--- a/llvm/test/CodeGen/X86/pr13859.ll
+++ b/llvm/test/CodeGen/X86/pr13859.ll
@@ -13,8 +13,7 @@ entry:
%a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1
%a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2
%a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3
- %a40 = bitcast <4 x i16> %a39 to x86_mmx
- %a41 = bitcast x86_mmx %a40 to <1 x i64>
+ %a40 = bitcast <4 x i16> %a39 to <1 x i64>
%a47 = trunc i32 %a32 to i1
br i1 %a47, label %a48, label %a49
@@ -23,6 +22,6 @@ a48:
unreachable
a49:
- store <1 x i64> %a41, ptr %dest, align 8 ; !!!
+ store <1 x i64> %a40, ptr %dest, align 8 ; !!!
ret void
}
diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll
index cd0ece1..da3246a 100644
--- a/llvm/test/CodeGen/X86/pr23246.ll
+++ b/llvm/test/CodeGen/X86/pr23246.ll
@@ -6,15 +6,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; PR23246
; We're really only interested in doing something sane with the shuffle.
-define <2 x i64> @test(x86_mmx %a) #0 {
+define <2 x i64> @test(<1 x i64> %a) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: retq
entry:
- %b = bitcast x86_mmx %a to <1 x i64>
- %s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
+ %s = shufflevector <1 x i64> %a, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
ret <2 x i64> %s
}
diff --git a/llvm/test/CodeGen/X86/pr29222.ll b/llvm/test/CodeGen/X86/pr29222.ll
index 1ddcb1f..6b8ac91 100644
--- a/llvm/test/CodeGen/X86/pr29222.ll
+++ b/llvm/test/CodeGen/X86/pr29222.ll
@@ -62,9 +62,9 @@ define i32 @PR29222(i32) nounwind {
; X64-AVX-NEXT: retq
%2 = insertelement <2 x i32> undef, i32 %0, i32 0
%3 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
- %4 = bitcast <2 x i32> %3 to x86_mmx
- %5 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %4, x86_mmx %4)
- %6 = bitcast x86_mmx %5 to i64
+ %4 = bitcast <2 x i32> %3 to <1 x i64>
+ %5 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %4, <1 x i64> %4)
+ %6 = bitcast <1 x i64> %5 to i64
%7 = insertelement <2 x i64> undef, i64 %6, i32 0
%8 = bitcast <2 x i64> %7 to <8 x i16>
%9 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %8, <8 x i16> undef)
@@ -73,5 +73,5 @@ define i32 @PR29222(i32) nounwind {
ret i32 %11
}
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>)
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll
index b602269..0ad3530 100644
--- a/llvm/test/CodeGen/X86/pr35982.ll
+++ b/llvm/test/CodeGen/X86/pr35982.ll
@@ -35,9 +35,9 @@ define float @PR35982_emms(<1 x i64>) nounwind {
%2 = bitcast <1 x i64> %0 to <2 x i32>
%3 = extractelement <2 x i32> %2, i32 0
%4 = extractelement <1 x i64> %0, i32 0
- %5 = bitcast i64 %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %5, x86_mmx %5)
- %7 = bitcast x86_mmx %6 to <2 x i32>
+ %5 = bitcast i64 %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %5, <1 x i64> %5)
+ %7 = bitcast <1 x i64> %6 to <2 x i32>
%8 = extractelement <2 x i32> %7, i32 0
tail call void @llvm.x86.mmx.emms()
%9 = sitofp i32 %3 to float
@@ -46,5 +46,5 @@ define float @PR35982_emms(<1 x i64>) nounwind {
ret float %11
}
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>)
declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/pr99396.ll b/llvm/test/CodeGen/X86/pr99396.ll
new file mode 100644
index 0000000..f534d32
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr99396.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=i386-unknown-freebsd -enable-misched -relocation-model=pic | FileCheck %s
+
+@c = external local_unnamed_addr global ptr
+
+declare i32 @fn2() local_unnamed_addr
+
+declare i32 @fn3() local_unnamed_addr
+
+define noundef i32 @fn4() #0 {
+entry:
+ %tmp0 = load i32, ptr @fn4, align 4
+; CHECK: movl fn4@GOT(%ebx), %edi
+; CHECK-NEXT: movl (%edi), %edx
+ %tmp1 = load ptr, ptr @c, align 4
+; CHECK: movl c@GOT(%ebx), %eax
+; CHECK-NEXT: movl (%eax), %esi
+; CHECK-NEXT: testl %esi, %esi
+ %cmp.g = icmp eq ptr %tmp1, null
+ br i1 %cmp.g, label %if.then.g, label %if.end3.g
+
+if.then.g: ; preds = %entry
+ %tmp2 = load i32, ptr inttoptr (i32 1 to ptr), align 4
+ %cmp1.g = icmp slt i32 %tmp2, 0
+ br i1 %cmp1.g, label %if.then2.g, label %if.end3.g
+
+if.then2.g: ; preds = %if.then.g
+ %.g = load volatile i32, ptr null, align 2147483648
+ br label %f.exit
+
+if.end3.g: ; preds = %if.then.g, %entry
+ %h.i.g = icmp eq i32 %tmp0, 0
+ br i1 %h.i.g, label %f.exit, label %while.body.g
+
+while.body.g: ; preds = %if.end3.g, %if.end8.g
+ %buff.addr.019.g = phi ptr [ %incdec.ptr.g, %if.end8.g ], [ @fn4, %if.end3.g ]
+ %g.addr.018.g = phi i32 [ %dec.g, %if.end8.g ], [ %tmp0, %if.end3.g ]
+ %call4.g = tail call i32 @fn3(ptr %tmp1, ptr %buff.addr.019.g, i32 %g.addr.018.g)
+ %cmp5.g = icmp slt i32 %call4.g, 0
+ br i1 %cmp5.g, label %if.then6.g, label %if.end8.g
+
+if.then6.g: ; preds = %while.body.g
+ %call7.g = tail call i32 @fn2(ptr null)
+ br label %f.exit
+
+if.end8.g: ; preds = %while.body.g
+ %dec.g = add i32 %g.addr.018.g, 1
+ %incdec.ptr.g = getelementptr i32, ptr %buff.addr.019.g, i32 1
+ store i64 0, ptr %tmp1, align 4
+ %h.not.g = icmp eq i32 %dec.g, 0
+ br i1 %h.not.g, label %f.exit, label %while.body.g
+
+f.exit: ; preds = %if.end8.g, %if.then6.g, %if.end3.g, %if.then2.g
+ ret i32 0
+}
+
+attributes #0 = { "frame-pointer"="all" "tune-cpu"="generic" }
diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll
index 8339cb7..8a4308a 100644
--- a/llvm/test/CodeGen/X86/select-mmx.ll
+++ b/llvm/test/CodeGen/X86/select-mmx.ll
@@ -51,9 +51,9 @@ define i64 @test47(i64 %arg) {
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
%cond = icmp eq i64 %arg, 0
- %slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
- %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
- %retc = bitcast x86_mmx %psll to i64
+ %slct = select i1 %cond, <1 x i64> bitcast (i64 7 to <1 x i64>), <1 x i64> bitcast (i64 0 to <1 x i64>)
+ %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct)
+ %retc = bitcast <1 x i64> %psll to i64
ret i64 %retc
}
@@ -104,13 +104,13 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
%cond = icmp eq i64 %arg, 0
- %xmmx = bitcast i64 %x to x86_mmx
- %ymmx = bitcast i64 %y to x86_mmx
- %slct = select i1 %cond, x86_mmx %xmmx, x86_mmx %ymmx
- %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct)
- %retc = bitcast x86_mmx %psll to i64
+ %xmmx = bitcast i64 %x to <1 x i64>
+ %ymmx = bitcast i64 %y to <1 x i64>
+ %slct = select i1 %cond, <1 x i64> %xmmx, <1 x i64> %ymmx
+ %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct)
+ %retc = bitcast <1 x i64> %psll to i64
ret i64 %retc
}
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
index 6652a8c..6eb99dd 100644
--- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s
-define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) {
+define <1 x i64> @stack_fold_cvtpd2pi(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvtpd2pi:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -12,12 +12,12 @@ define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) {
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
- ret x86_mmx %2
+ %2 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
-define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) {
+define <2 x double> @stack_fold_cvtpi2pd(<1 x i64> %a0) {
; CHECK-LABEL: stack_fold_cvtpi2pd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %mm0
@@ -27,13 +27,13 @@ define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) {
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: cvtpi2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %a0) nounwind readnone
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %a0) nounwind readnone
ret <2 x double> %2
}
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone
-define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) {
+define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) {
; CHECK-LABEL: stack_fold_cvtpi2ps:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %mm0
@@ -43,13 +43,13 @@ define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) {
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: cvtpi2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %a1) nounwind readnone
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) nounwind readnone
ret <4 x float> %2
}
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) {
+define <1 x i64> @stack_fold_cvtps2pi(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtps2pi:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -60,12 +60,12 @@ define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) {
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
- ret x86_mmx %2
+ %2 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
-define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) {
+define <1 x i64> @stack_fold_cvttpd2pi(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_cvttpd2pi:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -76,12 +76,12 @@ define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) {
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
- ret x86_mmx %2
+ %2 = call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
-define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) {
+define <1 x i64> @stack_fold_cvttps2pi(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvttps2pi:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -92,15 +92,15 @@ define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) {
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
- ret x86_mmx %2
+ %2 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
; TODO stack_fold_movd_load
; padd forces execution on mmx
-define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
+define i32 @stack_fold_movd_store(<1 x i64> %a0) nounwind {
; CHECK-LABEL: stack_fold_movd_store:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
@@ -123,8 +123,8 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
- %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0)
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0)
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = extractelement <2 x i32> %2, i32 0
%4 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
ret i32 %3
@@ -133,7 +133,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
; TODO stack_fold_movq_load
; padd forces execution on mmx
-define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
+define i64 @stack_fold_movq_store(<1 x i64> %a0) nounwind {
; CHECK-LABEL: stack_fold_movq_store:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
@@ -156,13 +156,13 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
- %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0)
- %2 = bitcast x86_mmx %1 to i64
+ %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0)
+ %2 = bitcast <1 x i64> %1 to i64
%3 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
ret i64 %2
}
-define x86_mmx @stack_fold_pabsb(x86_mmx %a0) {
+define <1 x i64> @stack_fold_pabsb(<1 x i64> %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %mm0
@@ -173,13 +173,13 @@ define x86_mmx @stack_fold_pabsb(x86_mmx %a0) {
; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %a0) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %a0) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pabsd(x86_mmx %a0) {
+define <1 x i64> @stack_fold_pabsd(<1 x i64> %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %mm0
@@ -190,13 +190,13 @@ define x86_mmx @stack_fold_pabsd(x86_mmx %a0) {
; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %a0) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %a0) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pabsw(x86_mmx %a0) {
+define <1 x i64> @stack_fold_pabsw(<1 x i64> %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %mm0
@@ -207,13 +207,13 @@ define x86_mmx @stack_fold_pabsw(x86_mmx %a0) {
; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %a0) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %a0) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_packssdw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -224,13 +224,13 @@ define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_packsswb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -241,13 +241,13 @@ define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_packuswb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -258,13 +258,13 @@ define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -275,13 +275,13 @@ define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -292,13 +292,13 @@ define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddq(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -309,13 +309,13 @@ define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddsb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -326,13 +326,13 @@ define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -343,13 +343,13 @@ define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddusb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddusb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -360,13 +360,13 @@ define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddusw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddusw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -377,13 +377,13 @@ define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_paddw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_paddw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -394,13 +394,13 @@ define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_palignr(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_palignr:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -411,13 +411,13 @@ define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a, x86_mmx %b, i8 1) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %a, <1 x i64> %b, i8 1) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone
-define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pand(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pand:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -428,13 +428,13 @@ define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pandn(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pandn:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -445,13 +445,13 @@ define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pavgb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -462,13 +462,13 @@ define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pavgw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -479,13 +479,13 @@ define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpeqb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pcmpeqb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -496,13 +496,13 @@ define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpeqd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pcmpeqd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -513,13 +513,13 @@ define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpeqw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pcmpeqw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -530,13 +530,13 @@ define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpgtb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pcmpgtb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -547,13 +547,13 @@ define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpgtd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pcmpgtd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -564,13 +564,13 @@ define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pcmpgtw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pcmpgtw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -581,13 +581,13 @@ define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phaddd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_phaddd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -598,13 +598,13 @@ define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phaddsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_phaddsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -615,13 +615,13 @@ define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phaddw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_phaddw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -632,13 +632,13 @@ define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phsubd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_phsubd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -649,13 +649,13 @@ define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phsubsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_phsubsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -666,13 +666,13 @@ define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_phsubw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_phsubw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -683,15 +683,15 @@ define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone
; TODO stack_fold_pinsrw
-define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaddubsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmaddubsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -702,13 +702,13 @@ define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaddwd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmaddwd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -719,13 +719,13 @@ define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaxsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -736,13 +736,13 @@ define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmaxub(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -753,13 +753,13 @@ define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pminsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pminsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -770,13 +770,13 @@ define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pminub(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pminub:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -787,13 +787,13 @@ define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmulhrsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmulhrsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -804,13 +804,13 @@ define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmulhuw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmulhuw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -821,13 +821,13 @@ define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmulhw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmulhw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -838,13 +838,13 @@ define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmullw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmullw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -855,13 +855,13 @@ define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pmuludq(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pmuludq:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -872,13 +872,13 @@ define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_por(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_por:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -889,13 +889,13 @@ define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psadbw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psadbw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -906,13 +906,13 @@ define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pshufb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pshufb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -923,13 +923,13 @@ define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pshufw(x86_mmx %a) {
+define <1 x i64> @stack_fold_pshufw(<1 x i64> %a) {
; CHECK-LABEL: stack_fold_pshufw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %mm0
@@ -941,13 +941,13 @@ define x86_mmx @stack_fold_pshufw(x86_mmx %a) {
; CHECK-NEXT: # mm0 = mem[1,0,0,0]
; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %a, i8 1) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone
-define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) {
+define <1 x i64> @stack_fold_psignb(<1 x i64> %a0, <1 x i64> %a1) {
; CHECK-LABEL: stack_fold_psignb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -958,13 +958,13 @@ define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) {
+define <1 x i64> @stack_fold_psignd(<1 x i64> %a0, <1 x i64> %a1) {
; CHECK-LABEL: stack_fold_psignd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -975,13 +975,13 @@ define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) {
+define <1 x i64> @stack_fold_psignw(<1 x i64> %a0, <1 x i64> %a1) {
; CHECK-LABEL: stack_fold_psignw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -992,13 +992,13 @@ define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pslld(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pslld:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1009,13 +1009,13 @@ define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psllq(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psllq:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1026,13 +1026,13 @@ define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psllw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psllw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1043,13 +1043,13 @@ define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrad(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psrad:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1060,13 +1060,13 @@ define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psraw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psraw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1077,13 +1077,13 @@ define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrld(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psrld:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1094,13 +1094,13 @@ define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrlq(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psrlq:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1111,13 +1111,13 @@ define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psrlw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psrlw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1128,13 +1128,13 @@ define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1145,13 +1145,13 @@ define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1162,13 +1162,13 @@ define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubq(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1179,13 +1179,13 @@ define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubsb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1196,13 +1196,13 @@ define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubsw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1213,13 +1213,13 @@ define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubusb(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1230,13 +1230,13 @@ define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubusw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1247,13 +1247,13 @@ define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_psubw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1264,13 +1264,13 @@ define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckhbw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_punpckhbw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1281,13 +1281,13 @@ define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckhdq(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_punpckhdq:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1298,13 +1298,13 @@ define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckhwd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1315,13 +1315,13 @@ define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpcklbw(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_punpcklbw:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1332,13 +1332,13 @@ define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpckldq(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_punpckldq:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1349,13 +1349,13 @@ define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_punpcklwd(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_punpcklwd:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1366,13 +1366,13 @@ define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone
-define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) {
+define <1 x i64> @stack_fold_pxor(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: stack_fold_pxor:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rsi, %mm0
@@ -1383,8 +1383,8 @@ define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: retq
- %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
- %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone
- ret x86_mmx %2
+ %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+ %2 = call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %a, <1 x i64> %b) nounwind readnone
+ ret <1 x i64> %2
}
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
index 52d0c2b5..629f44b 100644
--- a/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
+++ b/llvm/test/CodeGen/X86/unaligned_extract_from_vector_through_stack.ll
@@ -17,4 +17,22 @@ entry:
ret i32 %b
}
+define i32 @foo2(i32 %arg1) #1 {
+; CHECK-LABEL: foo2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $31, %edi
+; CHECK-NEXT: movzwl -72(%rsp,%rdi,2), %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %a = extractelement <32 x i16> zeroinitializer, i32 %arg1
+ %b = zext i16 %a to i32
+ ret i32 %b
+}
+
attributes #0 = { "no-realign-stack" "target-cpu"="skylake-avx512" }
+attributes #1 = { "no-realign-stack" "target-cpu"="skylake" }
diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll
index 6857101..b107b1c 100644
--- a/llvm/test/CodeGen/X86/vec-libcalls.ll
+++ b/llvm/test/CodeGen/X86/vec-libcalls.ll
@@ -25,6 +25,54 @@ declare <5 x float> @llvm.tan.v5f32(<5 x float>)
declare <6 x float> @llvm.tan.v6f32(<6 x float>)
declare <3 x double> @llvm.tan.v3f64(<3 x double>)
+declare <1 x float> @llvm.acos.v1f32(<1 x float>)
+declare <2 x float> @llvm.acos.v2f32(<2 x float>)
+declare <3 x float> @llvm.acos.v3f32(<3 x float>)
+declare <4 x float> @llvm.acos.v4f32(<4 x float>)
+declare <5 x float> @llvm.acos.v5f32(<5 x float>)
+declare <6 x float> @llvm.acos.v6f32(<6 x float>)
+declare <3 x double> @llvm.acos.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.asin.v1f32(<1 x float>)
+declare <2 x float> @llvm.asin.v2f32(<2 x float>)
+declare <3 x float> @llvm.asin.v3f32(<3 x float>)
+declare <4 x float> @llvm.asin.v4f32(<4 x float>)
+declare <5 x float> @llvm.asin.v5f32(<5 x float>)
+declare <6 x float> @llvm.asin.v6f32(<6 x float>)
+declare <3 x double> @llvm.asin.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.atan.v1f32(<1 x float>)
+declare <2 x float> @llvm.atan.v2f32(<2 x float>)
+declare <3 x float> @llvm.atan.v3f32(<3 x float>)
+declare <4 x float> @llvm.atan.v4f32(<4 x float>)
+declare <5 x float> @llvm.atan.v5f32(<5 x float>)
+declare <6 x float> @llvm.atan.v6f32(<6 x float>)
+declare <3 x double> @llvm.atan.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.cosh.v1f32(<1 x float>)
+declare <2 x float> @llvm.cosh.v2f32(<2 x float>)
+declare <3 x float> @llvm.cosh.v3f32(<3 x float>)
+declare <4 x float> @llvm.cosh.v4f32(<4 x float>)
+declare <5 x float> @llvm.cosh.v5f32(<5 x float>)
+declare <6 x float> @llvm.cosh.v6f32(<6 x float>)
+declare <3 x double> @llvm.cosh.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.sinh.v1f32(<1 x float>)
+declare <2 x float> @llvm.sinh.v2f32(<2 x float>)
+declare <3 x float> @llvm.sinh.v3f32(<3 x float>)
+declare <4 x float> @llvm.sinh.v4f32(<4 x float>)
+declare <5 x float> @llvm.sinh.v5f32(<5 x float>)
+declare <6 x float> @llvm.sinh.v6f32(<6 x float>)
+declare <3 x double> @llvm.sinh.v3f64(<3 x double>)
+
+declare <1 x float> @llvm.tanh.v1f32(<1 x float>)
+declare <2 x float> @llvm.tanh.v2f32(<2 x float>)
+declare <3 x float> @llvm.tanh.v3f32(<3 x float>)
+declare <4 x float> @llvm.tanh.v4f32(<4 x float>)
+declare <5 x float> @llvm.tanh.v5f32(<5 x float>)
+declare <6 x float> @llvm.tanh.v6f32(<6 x float>)
+declare <3 x double> @llvm.tanh.v3f64(<3 x double>)
+
; Verify that all of the potential libcall candidates are handled.
; Some of these have custom lowering, so those cases won't have
; libcalls.
@@ -432,6 +480,1170 @@ define <3 x double> @tan_v3f64(<3 x double> %x) nounwind {
ret <3 x double> %r
}
+define <1 x float> @acos_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: acos_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %r = call <1 x float> @llvm.acos.v1f32(<1 x float> %x)
+ ret <1 x float> %r
+}
+
+define <2 x float> @acos_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: acos_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <2 x float> @llvm.acos.v2f32(<2 x float> %x)
+ ret <2 x float> %r
+}
+
+define <3 x float> @acos_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: acos_v3f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x float> @llvm.acos.v3f32(<3 x float> %x)
+ ret <3 x float> %r
+}
+
+define <4 x float> @acos_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: acos_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <4 x float> @llvm.acos.v4f32(<4 x float> %x)
+ ret <4 x float> %r
+}
+
+define <5 x float> @acos_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: acos_v5f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <5 x float> @llvm.acos.v5f32(<5 x float> %x)
+ ret <5 x float> %r
+}
+
+define <6 x float> @acos_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: acos_v6f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq acosf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <6 x float> @llvm.acos.v6f32(<6 x float> %x)
+ ret <6 x float> %r
+}
+
+define <3 x double> @acos_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: acos_v3f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq acos@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq acos@PLT
+; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq acos@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x double> @llvm.acos.v3f64(<3 x double> %x)
+ ret <3 x double> %r
+}
+
+define <1 x float> @asin_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: asin_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %r = call <1 x float> @llvm.asin.v1f32(<1 x float> %x)
+ ret <1 x float> %r
+}
+
+define <2 x float> @asin_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: asin_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <2 x float> @llvm.asin.v2f32(<2 x float> %x)
+ ret <2 x float> %r
+}
+
+define <3 x float> @asin_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: asin_v3f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x float> @llvm.asin.v3f32(<3 x float> %x)
+ ret <3 x float> %r
+}
+
+define <4 x float> @asin_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: asin_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <4 x float> @llvm.asin.v4f32(<4 x float> %x)
+ ret <4 x float> %r
+}
+
+define <5 x float> @asin_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: asin_v5f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <5 x float> @llvm.asin.v5f32(<5 x float> %x)
+ ret <5 x float> %r
+}
+
+define <6 x float> @asin_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: asin_v6f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq asinf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <6 x float> @llvm.asin.v6f32(<6 x float> %x)
+ ret <6 x float> %r
+}
+
+define <3 x double> @asin_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: asin_v3f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq asin@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq asin@PLT
+; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq asin@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x double> @llvm.asin.v3f64(<3 x double> %x)
+ ret <3 x double> %r
+}
+
+define <1 x float> @atan_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: atan_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %r = call <1 x float> @llvm.atan.v1f32(<1 x float> %x)
+ ret <1 x float> %r
+}
+
+define <2 x float> @atan_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: atan_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <2 x float> @llvm.atan.v2f32(<2 x float> %x)
+ ret <2 x float> %r
+}
+
+define <3 x float> @atan_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: atan_v3f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x float> @llvm.atan.v3f32(<3 x float> %x)
+ ret <3 x float> %r
+}
+
+define <4 x float> @atan_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: atan_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <4 x float> @llvm.atan.v4f32(<4 x float> %x)
+ ret <4 x float> %r
+}
+
+define <5 x float> @atan_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: atan_v5f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <5 x float> @llvm.atan.v5f32(<5 x float> %x)
+ ret <5 x float> %r
+}
+
+define <6 x float> @atan_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: atan_v6f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq atanf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <6 x float> @llvm.atan.v6f32(<6 x float> %x)
+ ret <6 x float> %r
+}
+
+define <3 x double> @atan_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: atan_v3f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq atan@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq atan@PLT
+; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq atan@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x double> @llvm.atan.v3f64(<3 x double> %x)
+ ret <3 x double> %r
+}
+
+define <1 x float> @cosh_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %r = call <1 x float> @llvm.cosh.v1f32(<1 x float> %x)
+ ret <1 x float> %r
+}
+
+define <2 x float> @cosh_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <2 x float> @llvm.cosh.v2f32(<2 x float> %x)
+ ret <2 x float> %r
+}
+
+define <3 x float> @cosh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v3f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x float> @llvm.cosh.v3f32(<3 x float> %x)
+ ret <3 x float> %r
+}
+
+define <4 x float> @cosh_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <4 x float> @llvm.cosh.v4f32(<4 x float> %x)
+ ret <4 x float> %r
+}
+
+define <5 x float> @cosh_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v5f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <5 x float> @llvm.cosh.v5f32(<5 x float> %x)
+ ret <5 x float> %r
+}
+
+define <6 x float> @cosh_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: cosh_v6f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq coshf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <6 x float> @llvm.cosh.v6f32(<6 x float> %x)
+ ret <6 x float> %r
+}
+
+define <3 x double> @cosh_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: cosh_v3f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq cosh@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq cosh@PLT
+; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq cosh@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x double> @llvm.cosh.v3f64(<3 x double> %x)
+ ret <3 x double> %r
+}
+
+define <1 x float> @sinh_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %r = call <1 x float> @llvm.sinh.v1f32(<1 x float> %x)
+ ret <1 x float> %r
+}
+
+define <2 x float> @sinh_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <2 x float> @llvm.sinh.v2f32(<2 x float> %x)
+ ret <2 x float> %r
+}
+
+define <3 x float> @sinh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v3f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x float> @llvm.sinh.v3f32(<3 x float> %x)
+ ret <3 x float> %r
+}
+
+define <4 x float> @sinh_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <4 x float> @llvm.sinh.v4f32(<4 x float> %x)
+ ret <4 x float> %r
+}
+
+define <5 x float> @sinh_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v5f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <5 x float> @llvm.sinh.v5f32(<5 x float> %x)
+ ret <5 x float> %r
+}
+
+define <6 x float> @sinh_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: sinh_v6f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq sinhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <6 x float> @llvm.sinh.v6f32(<6 x float> %x)
+ ret <6 x float> %r
+}
+
+define <3 x double> @sinh_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: sinh_v3f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq sinh@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq sinh@PLT
+; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq sinh@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x double> @llvm.sinh.v3f64(<3 x double> %x)
+ ret <3 x double> %r
+}
+
+define <1 x float> @tanh_v1f32(<1 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %r = call <1 x float> @llvm.tanh.v1f32(<1 x float> %x)
+ ret <1 x float> %r
+}
+
+define <2 x float> @tanh_v2f32(<2 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <2 x float> @llvm.tanh.v2f32(<2 x float> %x)
+ ret <2 x float> %r
+}
+
+define <3 x float> @tanh_v3f32(<3 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v3f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x float> @llvm.tanh.v3f32(<3 x float> %x)
+ ret <3 x float> %r
+}
+
+define <4 x float> @tanh_v4f32(<4 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %r = call <4 x float> @llvm.tanh.v4f32(<4 x float> %x)
+ ret <4 x float> %r
+}
+
+define <5 x float> @tanh_v5f32(<5 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v5f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <5 x float> @llvm.tanh.v5f32(<5 x float> %x)
+ ret <5 x float> %r
+}
+
+define <6 x float> @tanh_v6f32(<6 x float> %x) nounwind {
+; CHECK-LABEL: tanh_v6f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,3,3,3]
+; CHECK-NEXT: callq tanhf@PLT
+; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <6 x float> @llvm.tanh.v6f32(<6 x float> %x)
+ ret <6 x float> %r
+}
+
+define <3 x double> @tanh_v3f64(<3 x double> %x) nounwind {
+; CHECK-LABEL: tanh_v3f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq tanh@PLT
+; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,0]
+; CHECK-NEXT: callq tanh@PLT
+; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq tanh@PLT
+; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+ %r = call <3 x double> @llvm.tanh.v3f64(<3 x double> %x)
+ ret <3 x double> %r
+}
+
define <2 x float> @fabs_v2f32(<2 x float> %x) nounwind {
; CHECK-LABEL: fabs_v2f32:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
index 6fd90243..cd375c0 100644
--- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
@@ -20,9 +20,9 @@ define i32 @test0(ptr %v4) nounwind {
entry:
%v5 = load <1 x i64>, ptr %v4, align 8
%v12 = bitcast <1 x i64> %v5 to <4 x i16>
- %v13 = bitcast <4 x i16> %v12 to x86_mmx
- %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
- %v15 = bitcast x86_mmx %v14 to <4 x i16>
+ %v13 = bitcast <4 x i16> %v12 to <1 x i64>
+ %v14 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v13, i8 -18)
+ %v15 = bitcast <1 x i64> %v14 to <4 x i16>
%v16 = bitcast <4 x i16> %v15 to <1 x i64>
%v17 = extractelement <1 x i64> %v16, i32 0
%v18 = bitcast i64 %v17 to <2 x i32>
@@ -52,12 +52,12 @@ entry:
%0 = load i32, ptr %ptr, align 4
%1 = insertelement <2 x i32> undef, i32 %0, i32 0
%2 = insertelement <2 x i32> %1, i32 0, i32 1
- %3 = bitcast <2 x i32> %2 to x86_mmx
- %4 = bitcast x86_mmx %3 to i64
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = bitcast <1 x i64> %3 to i64
%5 = bitcast i64 %4 to <4 x i16>
- %6 = bitcast <4 x i16> %5 to x86_mmx
- %7 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %6, i8 -24)
- %8 = bitcast x86_mmx %7 to <4 x i16>
+ %6 = bitcast <4 x i16> %5 to <1 x i64>
+ %7 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %6, i8 -24)
+ %8 = bitcast <1 x i64> %7 to <4 x i16>
%9 = bitcast <4 x i16> %8 to <1 x i64>
%10 = extractelement <1 x i64> %9, i32 0
%11 = bitcast i64 %10 to <2 x i32>
@@ -82,9 +82,9 @@ define i32 @test2(ptr nocapture readonly %ptr) nounwind {
; X64-NEXT: emms
; X64-NEXT: retq
entry:
- %0 = load x86_mmx, ptr %ptr, align 8
- %1 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %0, i8 -24)
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %0 = load <1 x i64>, ptr %ptr, align 8
+ %1 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %0, i8 -24)
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
%5 = bitcast i64 %4 to <2 x i32>
@@ -93,7 +93,7 @@ entry:
ret i32 %6
}
-define i32 @test3(x86_mmx %a) nounwind {
+define i32 @test3(<1 x i64> %a) nounwind {
; X86-LABEL: test3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -104,13 +104,13 @@ define i32 @test3(x86_mmx %a) nounwind {
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
- %tmp0 = bitcast x86_mmx %a to <2 x i32>
+ %tmp0 = bitcast <1 x i64> %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 0
ret i32 %tmp1
}
; Verify we don't muck with extractelts from the upper lane.
-define i32 @test4(x86_mmx %a) nounwind {
+define i32 @test4(<1 x i64> %a) nounwind {
; X86-LABEL: test4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -122,10 +122,10 @@ define i32 @test4(x86_mmx %a) nounwind {
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
- %tmp0 = bitcast x86_mmx %a to <2 x i32>
+ %tmp0 = bitcast <1 x i64> %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 1
ret i32 %tmp1
}
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8)
declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index 34280aa..176ae81 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -26,8 +26,8 @@ define void @t1(i32 %a, ptr %P) nounwind {
%tmp12 = shl i32 %a, 12
%tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
%tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
- %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
- store x86_mmx %tmp23, ptr %P
+ %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
+ store <1 x i64> %tmp23, ptr %P
ret void
}
diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll
index aed8782..67473fe 100644
--- a/llvm/test/CodeGen/X86/vec_insert-7.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -5,7 +5,7 @@
; MMX insertelement is not available; these are promoted to xmm.
; (Without SSE they are split to two ints, and the code is much better.)
-define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
+define <1 x i64> @mmx_movzl(<1 x i64> %x) nounwind {
; X86-LABEL: mmx_movzl:
; X86: ## %bb.0:
; X86-NEXT: movl $32, %eax
@@ -16,9 +16,9 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X64: ## %bb.0:
; X64-NEXT: movl $32, %eax
; X64-NEXT: retq
- %tmp = bitcast x86_mmx %x to <2 x i32>
+ %tmp = bitcast <1 x i64> %x to <2 x i32>
%tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
%tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1
- %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
- ret x86_mmx %tmp9
+ %tmp9 = bitcast <2 x i32> %tmp8 to <1 x i64>
+ ret <1 x i64> %tmp9
}
diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
index c004170..f95b346 100644
--- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64
; This is not an MMX operation; promoted to xmm.
-define x86_mmx @t0(i32 %A) nounwind {
+define <1 x i64> @t0(i32 %A) nounwind {
; X86-LABEL: t0:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -17,8 +17,8 @@ define x86_mmx @t0(i32 %A) nounwind {
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: retq
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
- %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
- ret x86_mmx %tmp4
+ %tmp4 = bitcast <2 x i32> %tmp3 to <1 x i64>
+ ret <1 x i64> %tmp4
}
define <8 x i8> @t1(i8 zeroext %x) nounwind {
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
index 709be65..6080067 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -52,9 +52,9 @@ entry:
%tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16>
%tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 >
%tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8>
- %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx
- %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx
- tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, ptr null)
+ %tmp556 = bitcast <8 x i8> %tmp555 to <1 x i64>
+ %tmp557 = bitcast <8 x i8> zeroinitializer to <1 x i64>
+ tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp557, <1 x i64> %tmp556, ptr null)
ret void
}
@@ -115,19 +115,19 @@ define <4 x float> @pr35869() nounwind {
; X64-NEXT: punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
; X64-NEXT: cvtpi2ps %mm0, %xmm0
; X64-NEXT: retq
- %1 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx bitcast (<8 x i8> <i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> to x86_mmx), x86_mmx bitcast (<8 x i8> zeroinitializer to x86_mmx))
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx bitcast (<4 x i16> zeroinitializer to x86_mmx), x86_mmx %1)
- %3 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %1, x86_mmx %2)
- %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, x86_mmx %3)
+ %1 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> bitcast (<8 x i8> <i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> to <1 x i64>), <1 x i64> bitcast (<8 x i8> zeroinitializer to <1 x i64>))
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> bitcast (<4 x i16> zeroinitializer to <1 x i64>), <1 x i64> %1)
+ %3 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %1, <1 x i64> %2)
+ %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, <1 x i64> %3)
%5 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
- %6 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %1, x86_mmx %2)
- %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, x86_mmx %6)
+ %6 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %1, <1 x i64> %2)
+ %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, <1 x i64> %6)
ret <4 x float> %7
}
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr)
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx)
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx)
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr)
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>)
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/x86-64-psub.ll b/llvm/test/CodeGen/X86/x86-64-psub.ll
index 9817d79..4c11464 100644
--- a/llvm/test/CodeGen/X86/x86-64-psub.ll
+++ b/llvm/test/CodeGen/X86/x86-64-psub.ll
@@ -32,11 +32,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
- %3 = bitcast <8 x i8> %2 to x86_mmx
+ %3 = bitcast <8 x i8> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
- %5 = bitcast <8 x i8> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %3, x86_mmx %5) nounwind
- %7 = bitcast x86_mmx %6 to <8 x i8>
+ %5 = bitcast <8 x i8> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %3, <1 x i64> %5) nounwind
+ %7 = bitcast <1 x i64> %6 to <8 x i8>
%8 = bitcast <8 x i8> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -66,11 +66,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
- %3 = bitcast <4 x i16> %2 to x86_mmx
+ %3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %3, x86_mmx %5) nounwind
- %7 = bitcast x86_mmx %6 to <4 x i16>
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %3, <1 x i64> %5) nounwind
+ %7 = bitcast <1 x i64> %6 to <4 x i16>
%8 = bitcast <4 x i16> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -100,11 +100,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <2 x i32>
- %3 = bitcast <2 x i32> %2 to x86_mmx
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <2 x i32>
- %5 = bitcast <2 x i32> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %3, x86_mmx %5) nounwind
- %7 = bitcast x86_mmx %6 to <2 x i32>
+ %5 = bitcast <2 x i32> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %3, <1 x i64> %5) nounwind
+ %7 = bitcast <1 x i64> %6 to <2 x i32>
%8 = bitcast <2 x i32> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -134,11 +134,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
- %3 = bitcast <8 x i8> %2 to x86_mmx
+ %3 = bitcast <8 x i8> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
- %5 = bitcast <8 x i8> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %3, x86_mmx %5) nounwind
- %7 = bitcast x86_mmx %6 to <8 x i8>
+ %5 = bitcast <8 x i8> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %3, <1 x i64> %5) nounwind
+ %7 = bitcast <1 x i64> %6 to <8 x i8>
%8 = bitcast <8 x i8> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -168,11 +168,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
- %3 = bitcast <4 x i16> %2 to x86_mmx
+ %3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %3, x86_mmx %5) nounwind
- %7 = bitcast x86_mmx %6 to <4 x i16>
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %3, <1 x i64> %5) nounwind
+ %7 = bitcast <1 x i64> %6 to <4 x i16>
%8 = bitcast <4 x i16> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -202,11 +202,11 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8>
- %3 = bitcast <8 x i8> %2 to x86_mmx
+ %3 = bitcast <8 x i8> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8>
- %5 = bitcast <8 x i8> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %3, x86_mmx %5) nounwind
- %7 = bitcast x86_mmx %6 to <8 x i8>
+ %5 = bitcast <8 x i8> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %3, <1 x i64> %5) nounwind
+ %7 = bitcast <1 x i64> %6 to <8 x i8>
%8 = bitcast <8 x i8> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
@@ -236,26 +236,26 @@ entry:
%__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0
%__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0
%2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16>
- %3 = bitcast <4 x i16> %2 to x86_mmx
+ %3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16>
- %5 = bitcast <4 x i16> %4 to x86_mmx
- %6 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %3, x86_mmx %5) nounwind
- %7 = bitcast x86_mmx %6 to <4 x i16>
+ %5 = bitcast <4 x i16> %4 to <1 x i64>
+ %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %3, <1 x i64> %5) nounwind
+ %7 = bitcast <1 x i64> %6 to <4 x i16>
%8 = bitcast <4 x i16> %7 to <1 x i64>
%retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0
ret i64 %retval.0.extract.i15
}
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
diff --git a/llvm/test/DebugInfo/X86/loop-align-debug.ll b/llvm/test/DebugInfo/X86/loop-align-debug.ll
new file mode 100644
index 0000000..a0302d0
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/loop-align-debug.ll
@@ -0,0 +1,55 @@
+; RUN: llc %s --filetype=obj -o %t
+; RUN: llvm-objdump -d %t | FileCheck %s --check-prefixes=OBJ
+; RUN: llvm-dwarfdump --debug-line %t | FileCheck %s --check-prefixes=DBG
+; RUN: llc %s -o - | FileCheck %s --check-prefixes=ASM
+
+; OBJ: 1:{{.*}}nop
+
+;; Address Line Column File ISA Discriminator OpIndex Flags
+; DBG: 0x0000000000000000 3 0 0 0 0 0 is_stmt
+; DBG: 0x0000000000000001 0 0 0 0 0 0
+; DBG: 0x0000000000000010 5 0 0 0 0 0 is_stmt prologue_end
+; DBG: 0x0000000000000017 5 0 0 0 0 0 is_stmt end_sequence
+
+; ASM: .loc 0 0 0 is_stmt 0
+; ASM-NEXT: .L{{.*}}:
+; ASM-NEXT: .p2align 4, 0x90
+
+;; $ cat test.cpp
+;; void g();
+;; void f() {
+;; [[clang::code_align(16)]]
+;; while (1) { g(); }
+;; }
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @f() local_unnamed_addr !dbg !9 {
+entry:
+ br label %while.body, !dbg !12
+
+while.body: ; preds = %entry, %while.body
+ tail call void @g(), !dbg !12
+ br label %while.body, !dbg !12, !llvm.loop !13
+}
+
+declare !dbg !16 void @g() local_unnamed_addr
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.cpp", directory: "/")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{!"clang version 19.0.0git"}
+!9 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !10, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = !DISubroutineType(types: !11)
+!11 = !{}
+!12 = !DILocation(line: 5, scope: !9)
+!13 = distinct !{!13, !12, !12, !14, !15}
+!14 = !{!"llvm.loop.mustprogress"}
+!15 = !{!"llvm.loop.align", i32 16}
+!16 = !DISubprogram(name: "g", scope: !1, file: !1, line: 2, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
index d5cb27d..0ef9423 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
@@ -1,15 +1,9 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=HOT70
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=HOT99
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=NONE
; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=1.0 2>&1 | FileCheck %s --check-prefix=ALL
; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=0.0 2>&1 | FileCheck %s --check-prefix=NONE
-
-; HOT70: remark: <unknown>:0:0: Sanitized: F=sanitized
-; HOT70: @sanitized
-; HOT70-NEXT: @__hwasan_tls
-
-; HOT99: remark: <unknown>:0:0: Skipped: F=sanitized
-; HOT99: @sanitized
-; HOT99-NEXT: %x = alloca i8, i64 4
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=1.0 -hwasan-percentile-cutoff-hot=990000 2>&1 | FileCheck %s --check-prefix=NONE
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -pass-remarks=hwasan -pass-remarks-missed=hwasan -S -hwasan-random-rate=0.0 -hwasan-percentile-cutoff-hot=700000 2>&1 | FileCheck %s --check-prefix=NONE
; ALL: remark: <unknown>:0:0: Sanitized: F=sanitize
; ALL: @sanitized
diff --git a/llvm/test/Instrumentation/HeapProfiler/basic-histogram.ll b/llvm/test/Instrumentation/HeapProfiler/basic-histogram.ll
new file mode 100644
index 0000000..c7ff129
--- /dev/null
+++ b/llvm/test/Instrumentation/HeapProfiler/basic-histogram.ll
@@ -0,0 +1,57 @@
+; Test basic memory profiler instrumentation with histograms.
+;
+; RUN: opt < %s -passes='function(memprof),memprof-module' -memprof-histogram -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: @llvm.used = appending global [1 x ptr] [ptr @memprof.module_ctor]
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @memprof.module_ctor, ptr null }]
+
+define i32 @test_load(ptr %a) {
+entry:
+ %tmp1 = load i32, ptr %a, align 4
+ ret i32 %tmp1
+}
+; CHECK-LABEL: @test_load
+; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, ptr @__memprof_shadow_memory_dynamic_address
+; CHECK-NEXT: %[[LOAD_ADDR:[^ ]*]] = ptrtoint ptr %a to i64
+; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -8
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT: %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT: %[[LOAD_SHADOW:[^ ]*]] = load i8, ptr %[[LOAD_SHADOW_PTR]]
+; CHECK-NEXT: %[[ICMP_MAX_COUNT:[^ ]*]] = icmp ult i8 %[[LOAD_SHADOW]], -1
+; CHECK-NEXT: br i1 %[[ICMP_MAX_COUNT]], label %[[INC_LABEL:[^ ]*]], label %[[ELSE_LABEL:[^ ]*]]
+; CHECK: [[INC_LABEL]]:
+; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i8 %[[LOAD_SHADOW]], 1
+; CHECK-NEXT: store i8 %[[NEW_SHADOW]], ptr %[[LOAD_SHADOW_PTR]]
+; CHECK-NEXT: br label %[[ELSE_LABEL]]
+; The actual load.
+; CHECK: [[ELSE_LABEL]]:
+; CHECK-NEXT: %tmp1 = load i32, ptr %a
+; CHECK-NEXT: ret i32 %tmp1
+
+define void @test_store(ptr %a) {
+entry:
+ store i32 42, ptr %a, align 4
+ ret void
+}
+; CHECK-LABEL: @test_store
+; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, ptr @__memprof_shadow_memory_dynamic_address
+; CHECK-NEXT: %[[LOAD_ADDR:[^ ]*]] = ptrtoint ptr %a to i64
+; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -8
+; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
+; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
+; CHECK-NEXT: %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr
+; CHECK-NEXT: %[[STORE_SHADOW:[^ ]*]] = load i8, ptr %[[STORE_SHADOW_PTR]]
+; CHECK-NEXT: %[[ICMP_MAX_COUNT:[^ ]*]] = icmp ult i8 %[[STORE_SHADOW]], -1
+; CHECK-NEXT: br i1 %[[ICMP_MAX_COUNT]], label %[[INC_LABEL:[^ ]*]], label %[[ELSE_LABEL:[^ ]*]]
+; CHECK: [[INC_LABEL]]:
+; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i8 %[[STORE_SHADOW]], 1
+; CHECK-NEXT: store i8 %[[NEW_SHADOW]], ptr %[[STORE_SHADOW_PTR]]
+; CHECK-NEXT: br label %[[ELSE_LABEL]]
+; The actual store.
+; CHECK: [[ELSE_LABEL]]:
+; CHECK-NEXT: store i32 42, ptr %a, align 4
+; CHECK-NEXT:  ret void
\ No newline at end of file
diff --git a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll
index 20c002e..fbdc7b5 100644
--- a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll
@@ -1,9 +1,7 @@
; Check that MC/DC intrinsics are properly lowered
; RUN: opt < %s -passes=instrprof -S | FileCheck %s --check-prefixes=CHECK,BASIC
; RUN: opt < %s -passes=instrprof -S -instrprof-atomic-counter-update-all | FileCheck %s --check-prefixes=CHECK,ATOMIC
-; RUN: opt < %s -passes=instrprof -runtime-counter-relocation -S 2>&1 | FileCheck %s --check-prefix RELOC
-
-; RELOC: Runtime counter relocation is presently not supported for MC/DC bitmaps
+; RUN: opt < %s -passes=instrprof -S -runtime-counter-relocation | FileCheck %s --check-prefixes=CHECK,RELOC
target triple = "x86_64-unknown-linux-gnu"
@@ -14,10 +12,15 @@ target triple = "x86_64-unknown-linux-gnu"
define dso_local void @test(i32 noundef %A) {
entry:
+ ; RELOC: %profbm_bias = load i64, ptr @__llvm_profile_bitmap_bias, align [[#]], !invariant.load !0
+ ; RELOC: %profc_bias = load i64, ptr @__llvm_profile_counter_bias, align [[#]]
%A.addr = alloca i32, align 4
%mcdc.addr = alloca i32, align 4
call void @llvm.instrprof.cover(ptr @__profn_test, i64 99278, i32 5, i32 0)
; BASIC: store i8 0, ptr @__profc_test, align 1
+ ; RELOC: %[[PROFC_INTADDR:.+]] = add i64 ptrtoint (ptr @__profc_test to i64), %profc_bias
+ ; RELOC: %[[PROFC_ADDR:.+]] = inttoptr i64 %[[PROFC_INTADDR]] to ptr
+ ; RELOC: store i8 0, ptr %[[PROFC_ADDR]], align 1
call void @llvm.instrprof.mcdc.parameters(ptr @__profn_test, i64 99278, i32 1)
store i32 0, ptr %mcdc.addr, align 4
@@ -25,6 +28,7 @@ entry:
%tobool = icmp ne i32 %0, 0
call void @llvm.instrprof.mcdc.tvbitmap.update(ptr @__profn_test, i64 99278, i32 0, ptr %mcdc.addr)
+ ; RELOC: [[PROFBM_ADDR:%.+]] = getelementptr i8, ptr @__profbm_test, i64 %profbm_bias
; CHECK: %[[TEMP0:mcdc.*]] = load i32, ptr %mcdc.addr, align 4
; CHECK-NEXT: %[[TEMP:[0-9]+]] = add i32 %[[TEMP0]], 0
; CHECK-NEXT: %[[LAB4:[0-9]+]] = lshr i32 %[[TEMP]], 3
@@ -39,7 +43,9 @@ entry:
; CHECK: define private void @[[RMW_OR]](ptr %[[ARGPTR:.+]], i8 %[[ARGVAL:.+]])
; CHECK: %[[BITS:.+]] = load i8, ptr %[[ARGPTR]], align 1
; BASIC-NEXT: %[[LAB11:[0-9]+]] = or i8 %[[BITS]], %[[ARGVAL]]
+; RELOC-NEXT: %[[LAB11:[0-9]+]] = or i8 %[[BITS]], %[[ARGVAL]]
; BASIC-NEXT: store i8 %[[LAB11]], ptr %[[ARGPTR]], align 1
+; RELOC-NEXT: store i8 %[[LAB11]], ptr %[[ARGPTR]], align 1
; ATOMIC-NEXT: %[[MASKED:.+]] = and i8 %[[BITS]], %[[ARGVAL]]
; ATOMIC-NEXT: %[[SHOULDWRITE:.+]] = icmp ne i8 %[[MASKED]], %[[ARGVAL]]
; ATOMIC-NEXT: br i1 %[[SHOULDWRITE]], label %[[WRITE:.+]], label %[[SKIP:.+]], !prof ![[MDPROF:[0-9]+]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
index df3bfba..421f00f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst.ll
@@ -22,6 +22,7 @@ define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
; CHECK: 7:
@@ -29,7 +30,6 @@ define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %P)
@@ -47,6 +47,7 @@ define void @st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -54,7 +55,6 @@ define void @st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> %B, ptr %P)
@@ -72,6 +72,7 @@ define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -79,7 +80,6 @@ define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> undef, ptr %P)
@@ -96,6 +96,7 @@ define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
@@ -103,7 +104,6 @@ define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> undef, <8 x i8> undef, ptr %P)
@@ -123,6 +123,7 @@ define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sani
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -130,7 +131,6 @@ define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sani
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P)
@@ -149,6 +149,7 @@ define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -156,7 +157,6 @@ define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> %C, ptr %P)
@@ -175,6 +175,7 @@ define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -182,7 +183,6 @@ define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, ptr %P)
@@ -201,6 +201,7 @@ define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -208,7 +209,6 @@ define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> undef, ptr %P)
@@ -226,6 +226,7 @@ define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -233,7 +234,6 @@ define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, ptr %P)
@@ -251,6 +251,7 @@ define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -258,7 +259,6 @@ define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, ptr %P)
@@ -276,6 +276,7 @@ define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -283,7 +284,6 @@ define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, ptr %P)
@@ -300,6 +300,7 @@ define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) noun
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
@@ -307,7 +308,6 @@ define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) noun
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P)
@@ -328,6 +328,7 @@ define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -335,7 +336,6 @@ define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
@@ -355,6 +355,7 @@ define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -362,7 +363,6 @@ define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
@@ -382,6 +382,7 @@ define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -389,7 +390,6 @@ define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, <8 x i8> %D, ptr %P)
@@ -409,6 +409,7 @@ define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -416,7 +417,6 @@ define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> undef, <8 x i8> %D, ptr %P)
@@ -436,6 +436,7 @@ define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -443,7 +444,6 @@ define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> undef, ptr %P)
@@ -462,6 +462,7 @@ define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -469,7 +470,6 @@ define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, <8 x i8> %D, ptr %P)
@@ -488,6 +488,7 @@ define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -495,7 +496,6 @@ define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, <8 x i8> %D, ptr %P)
@@ -514,6 +514,7 @@ define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -521,7 +522,6 @@ define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> %D, ptr %P)
@@ -540,6 +540,7 @@ define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -547,7 +548,6 @@ define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP3]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> %C, <8 x i8> undef, ptr %P)
@@ -565,6 +565,7 @@ define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -572,7 +573,6 @@ define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> %D, ptr %P)
@@ -590,6 +590,7 @@ define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -597,7 +598,6 @@ define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> [[C]], <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> %C, <8 x i8> undef, ptr %P)
@@ -615,6 +615,7 @@ define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -622,7 +623,6 @@ define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> [[B]], <8 x i8> undef, <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> %B, <8 x i8> undef, <8 x i8> undef, ptr %P)
@@ -640,6 +640,7 @@ define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -647,7 +648,6 @@ define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P)
@@ -664,6 +664,7 @@ define void @st4_8b_undefABCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
@@ -671,7 +672,6 @@ define void @st4_8b_undefABCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, ptr [[TMP4]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, ptr %P)
@@ -696,6 +696,7 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -703,7 +704,6 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %P)
@@ -723,6 +723,7 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -730,7 +731,6 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P)
@@ -751,6 +751,7 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -758,7 +759,6 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P)
@@ -783,6 +783,7 @@ define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -790,7 +791,6 @@ define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %P)
@@ -810,6 +810,7 @@ define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -817,7 +818,6 @@ define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P)
@@ -838,6 +838,7 @@ define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -845,7 +846,6 @@ define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %P)
@@ -870,6 +870,7 @@ define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -877,7 +878,6 @@ define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %P)
@@ -897,6 +897,7 @@ define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -904,7 +905,6 @@ define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P)
@@ -925,6 +925,7 @@ define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -932,7 +933,6 @@ define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %P)
@@ -957,6 +957,7 @@ define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -964,7 +965,6 @@ define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %P)
@@ -984,6 +984,7 @@ define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -991,7 +992,6 @@ define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P)
@@ -1012,6 +1012,7 @@ define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1019,7 +1020,6 @@ define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %P)
@@ -1042,6 +1042,7 @@ define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1049,7 +1050,6 @@ define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %P)
@@ -1069,6 +1069,7 @@ define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1076,7 +1077,6 @@ define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P)
@@ -1097,6 +1097,7 @@ define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1104,7 +1105,6 @@ define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %P)
@@ -1130,6 +1130,7 @@ define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1137,7 +1138,6 @@ define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %P)
@@ -1157,6 +1157,7 @@ define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1164,7 +1165,6 @@ define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P)
@@ -1185,6 +1185,7 @@ define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1192,7 +1193,6 @@ define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %P)
@@ -1217,6 +1217,7 @@ define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1224,7 +1225,6 @@ define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %P)
@@ -1242,6 +1242,7 @@ define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1249,7 +1250,6 @@ define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> %B, ptr %P)
@@ -1267,6 +1267,7 @@ define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1274,7 +1275,6 @@ define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> undef, ptr %P)
@@ -1291,6 +1291,7 @@ define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitiz
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
@@ -1298,7 +1299,6 @@ define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitiz
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> undef, <2 x i64> undef, ptr %P)
@@ -1318,6 +1318,7 @@ define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1325,7 +1326,6 @@ define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P)
@@ -1344,6 +1344,7 @@ define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1351,7 +1352,6 @@ define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, ptr %P)
@@ -1370,6 +1370,7 @@ define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1377,7 +1378,6 @@ define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, ptr %P)
@@ -1396,6 +1396,7 @@ define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1403,7 +1404,6 @@ define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, ptr %P)
@@ -1421,6 +1421,7 @@ define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1428,7 +1429,6 @@ define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, ptr %P)
@@ -1446,6 +1446,7 @@ define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1453,7 +1454,6 @@ define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, ptr %P)
@@ -1471,6 +1471,7 @@ define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1478,7 +1479,6 @@ define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, ptr %P)
@@ -1495,6 +1495,7 @@ define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) n
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
@@ -1502,7 +1503,6 @@ define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) n
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P)
@@ -1523,6 +1523,7 @@ define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1530,7 +1531,6 @@ define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P)
@@ -1554,6 +1554,7 @@ define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1561,7 +1562,6 @@ define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %P)
@@ -1581,6 +1581,7 @@ define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1588,7 +1589,6 @@ define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, <2 x i64> %D, ptr %P)
@@ -1608,6 +1608,7 @@ define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1615,7 +1616,6 @@ define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, <2 x i64> %D, ptr %P)
@@ -1635,6 +1635,7 @@ define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1642,7 +1643,6 @@ define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> undef, ptr %P)
@@ -1661,6 +1661,7 @@ define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1668,7 +1669,6 @@ define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, <2 x i64> %D, ptr %P)
@@ -1687,6 +1687,7 @@ define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1694,7 +1695,6 @@ define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, <2 x i64> %D, ptr %P)
@@ -1713,6 +1713,7 @@ define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1720,7 +1721,6 @@ define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> %C, <2 x i64> undef, ptr %P)
@@ -1739,6 +1739,7 @@ define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1746,7 +1747,6 @@ define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, <2 x i64> %D, ptr %P)
@@ -1765,6 +1765,7 @@ define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1772,7 +1773,6 @@ define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> %C, <2 x i64> undef, ptr %P)
@@ -1791,6 +1791,7 @@ define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -1798,7 +1799,6 @@ define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> undef, <2 x i64> undef, ptr %P)
@@ -1816,6 +1816,7 @@ define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1823,7 +1824,6 @@ define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> %D, ptr %P)
@@ -1841,6 +1841,7 @@ define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1848,7 +1849,6 @@ define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> [[C]], <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> %C, <2 x i64> undef, ptr %P)
@@ -1866,6 +1866,7 @@ define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1873,7 +1874,6 @@ define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> [[B]], <2 x i64> undef, <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> %B, <2 x i64> undef, <2 x i64> undef, ptr %P)
@@ -1891,6 +1891,7 @@ define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
; CHECK: 6:
@@ -1898,7 +1899,6 @@ define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
; CHECK-NEXT: unreachable
; CHECK: 7:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP5]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P)
@@ -1915,6 +1915,7 @@ define void @st4_2d_undefABCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
; CHECK: 5:
@@ -1922,7 +1923,6 @@ define void @st4_2d_undefABCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64
; CHECK-NEXT: unreachable
; CHECK: 6:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, <2 x i64> <i64 -1, i64 -1>, ptr [[TMP4]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, ptr %P)
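For orientation, the hunks above all describe the same instrumented shape. The sketch below is hand-written rather than copied from the patch (the function name @st2_sketch, the getelementptr form of the parameter-TLS offsets, and the omission of the !prof branch weights are simplifications of what the CHECK lines spell out): the operand shadows are stored through the matching integer-typed NEON intrinsic at the shadow address, and only the pointer's own shadow feeds the warning branch.

    target triple = "aarch64--linux-android9001"

    @__msan_param_tls = external thread_local(initialexec) global [100 x i64]

    declare void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64>, <2 x i64>, ptr)
    declare void @__msan_warning_noreturn()

    define void @st2_sketch(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
      ; Shadows of %A, %B and %p come from the MSan parameter TLS block (offsets 0, 16, 32).
      %sA = load <2 x i64>, ptr @__msan_param_tls, align 8
      %pB = getelementptr i8, ptr @__msan_param_tls, i64 16
      %sB = load <2 x i64>, ptr %pB, align 8
      %pP = getelementptr i8, ptr @__msan_param_tls, i64 32
      %sP = load i64, ptr %pP, align 8
      ; Store the operand shadows through the integer-typed st2 intrinsic at the shadow
      ; address (application address xor 0xB00000000000 on this target).
      %addr = ptrtoint ptr %p to i64
      %saddr = xor i64 %addr, 193514046488576
      %sptr = inttoptr i64 %saddr to ptr
      call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %sA, <2 x i64> %sB, ptr %sptr)
      ; Only the pointer shadow is checked before the application store.
      %bad = icmp ne i64 %sP, 0
      br i1 %bad, label %warn, label %cont
    warn:
      call void @__msan_warning_noreturn()
      unreachable
    cont:
      call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %p)
      ret void
    }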
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
index 2ac676f..8fed5a7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
@@ -3,7 +3,7 @@
; Test memory sanitizer instrumentation for Arm NEON VST_{2,3,4} and
; VST_1x{2,3,4} instructions, including floating-point parameters.
;
-; RUN: opt < %s -passes=msan -S -disable-verify | FileCheck %s
+; RUN: opt < %s -passes=msan -S | FileCheck %s
;
; Generated with:
; grep call clang/test/CodeGen/aarch64-neon-intrinsics.c \
@@ -37,22 +37,20 @@ target triple = "aarch64--linux-android9001"
define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x2_v1f64(
; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0:![0-9]+]]
-; CHECK: 6:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
+; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -63,22 +61,20 @@ define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memo
define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x2_v1i64(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK: 6:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
+; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -89,22 +85,20 @@ define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x2_v2f64(
; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK: 6:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
+; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -115,22 +109,20 @@ define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memo
define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x2_v2i64(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
-; CHECK: 6:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
-; CHECK: 7:
+; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -141,22 +133,17 @@ define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x3_v1f64(
; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
@@ -171,22 +158,17 @@ define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr
define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x3_v1i64(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
@@ -201,22 +183,17 @@ define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanit
define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x3_v2f64(
; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
@@ -231,22 +208,17 @@ define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr
define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x3_v2i64(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
@@ -261,30 +233,22 @@ define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanit
define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x4_v1f64(
; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK: 10:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
-; CHECK: 11:
+; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -295,30 +259,22 @@ define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x
define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x4_v1i64(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK: 10:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
-; CHECK: 11:
+; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -329,30 +285,22 @@ define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D,
define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x4_v2f64(
; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK: 10:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
-; CHECK: 11:
+; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -363,30 +311,22 @@ define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x
define void @st1x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p) sanitize_memory {
; CHECK-LABEL: define void @st1x4_v2i64(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
-; CHECK: 10:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK: 9:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
; CHECK-NEXT: unreachable
-; CHECK: 11:
+; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
; CHECK-NEXT: ret void
;
@@ -404,6 +344,7 @@ define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -411,7 +352,6 @@ define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %p)
@@ -428,6 +368,7 @@ define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -435,11 +376,6 @@ define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], ptr [[P]])
-;
-; NOTE: the next call is invalid because the parameters (shadows) are integer, but the called
-; function expects floating-point parameters.
-;
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> %A, <1 x double> %B, ptr %p)
@@ -456,6 +392,7 @@ define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -463,7 +400,6 @@ define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> %A, <1 x i64> %B, ptr %p)
@@ -480,6 +416,7 @@ define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -487,7 +424,6 @@ define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> %A, <2 x float> %B, ptr %p)
@@ -504,6 +440,7 @@ define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -511,7 +448,6 @@ define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> %A, <2 x double> %B, ptr %p)
@@ -528,6 +464,7 @@ define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -535,7 +472,6 @@ define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> %A, <2 x i32> %B, ptr %p)
@@ -552,6 +488,7 @@ define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -559,7 +496,6 @@ define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> %A, <2 x i64> %B, ptr %p)
@@ -576,6 +512,7 @@ define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -583,7 +520,6 @@ define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> %A, <4 x half> %B, ptr %p)
@@ -600,6 +536,7 @@ define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -607,7 +544,6 @@ define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %A, <4 x float> %B, ptr %p)
@@ -624,6 +560,7 @@ define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -631,7 +568,6 @@ define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> %A, <4 x i16> %B, ptr %p)
@@ -648,6 +584,7 @@ define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -655,7 +592,6 @@ define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %A, <4 x i32> %B, ptr %p)
@@ -672,6 +608,7 @@ define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -679,7 +616,6 @@ define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> %A, <8 x half> %B, ptr %p)
@@ -696,6 +632,7 @@ define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -703,7 +640,6 @@ define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> %A, <8 x i16> %B, ptr %p)
@@ -720,6 +656,7 @@ define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
; CHECK: 7:
@@ -727,7 +664,6 @@ define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %p) sanitize_memory {
; CHECK-NEXT: unreachable
; CHECK: 8:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> %A, <8 x i8> %B, ptr %p)
@@ -745,6 +681,7 @@ define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p) sanitiz
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -752,7 +689,6 @@ define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p) sanitiz
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p)
@@ -770,6 +706,7 @@ define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -777,7 +714,6 @@ define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p)
@@ -795,6 +731,7 @@ define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitiz
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -802,7 +739,6 @@ define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitiz
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p)
@@ -820,6 +756,7 @@ define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p) s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -827,7 +764,6 @@ define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p) s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p)
@@ -845,6 +781,7 @@ define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -852,7 +789,6 @@ define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p)
@@ -870,6 +806,7 @@ define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p) sanitiz
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -877,7 +814,6 @@ define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p) sanitiz
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p)
@@ -895,6 +831,7 @@ define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitiz
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -902,7 +839,6 @@ define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitiz
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p)
@@ -920,6 +856,7 @@ define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p) sani
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -927,7 +864,6 @@ define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p) sani
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p)
@@ -945,6 +881,7 @@ define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p) s
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -952,7 +889,6 @@ define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p) s
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p)
@@ -970,6 +906,7 @@ define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p) sanitiz
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -977,7 +914,6 @@ define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p) sanitiz
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p)
@@ -995,6 +931,7 @@ define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p) sanitiz
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1002,7 +939,6 @@ define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p) sanitiz
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p)
@@ -1020,6 +956,7 @@ define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p) sani
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1027,7 +964,6 @@ define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p) sani
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p)
@@ -1045,6 +981,7 @@ define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p) sanitiz
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1052,7 +989,6 @@ define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p) sanitiz
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p)
@@ -1070,6 +1006,7 @@ define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p) sanitize_me
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
; CHECK: 8:
@@ -1077,7 +1014,6 @@ define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p) sanitize_me
; CHECK-NEXT: unreachable
; CHECK: 9:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p)
@@ -1096,6 +1032,7 @@ define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1103,7 +1040,6 @@ define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %p)
@@ -1122,6 +1058,7 @@ define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x d
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1129,7 +1066,6 @@ define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x d
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[A]], <1 x double> [[B]], <1 x double> [[C]], <1 x double> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p)
@@ -1148,6 +1084,7 @@ define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1155,7 +1092,6 @@ define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> [[C]], <1 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p)
@@ -1174,6 +1110,7 @@ define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x floa
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1181,7 +1118,6 @@ define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x floa
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[A]], <2 x float> [[B]], <2 x float> [[C]], <2 x float> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %p)
@@ -1200,6 +1136,7 @@ define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x d
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1207,7 +1144,6 @@ define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x d
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]], <2 x double> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p)
@@ -1226,6 +1162,7 @@ define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1233,7 +1170,6 @@ define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> [[C]], <2 x i32> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %p)
@@ -1252,6 +1188,7 @@ define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1259,7 +1196,6 @@ define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p)
@@ -1278,6 +1214,7 @@ define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1285,7 +1222,6 @@ define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], <4 x half> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %p)
@@ -1304,6 +1240,7 @@ define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x floa
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1311,7 +1248,6 @@ define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x floa
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]], <4 x float> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %p)
@@ -1330,6 +1266,7 @@ define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1337,7 +1274,6 @@ define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], <4 x i16> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %p)
@@ -1356,6 +1292,7 @@ define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1363,7 +1300,6 @@ define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %p)
@@ -1382,6 +1318,7 @@ define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1389,7 +1326,6 @@ define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], <8 x half> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %p)
@@ -1408,6 +1344,7 @@ define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1415,7 +1352,6 @@ define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %p)
@@ -1434,6 +1370,7 @@ define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
; CHECK: 9:
@@ -1441,7 +1378,6 @@ define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p
; CHECK-NEXT: unreachable
; CHECK: 10:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]], <8 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_lane.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_lane.ll
new file mode 100644
index 0000000..b7a2721
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_lane.ll
@@ -0,0 +1,1977 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+;
+; Test MemorySanitizer instrumentation for Arm store-with-lane instructions.
+; Note: st{2,3,4}lane use Arm NEON intrinsics, but st1lane does not (it lowers
+; to an extractelement plus a plain store).
+;
+; RUN: opt < %s -passes=msan -S | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll
+
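+; Illustrative sketch (value names here are hypothetical, inferred from the
+; CHECK lines below): on this aarch64 Android triple, the msan pass maps an
+; application address to its shadow address by XORing it with 0xB00000000000
+; (193514046488576) and stores the shadow of the written lane there, roughly:
+;
+;   %shadow_int = xor i64 %app_addr_int, 193514046488576
+;   %shadow_ptr = inttoptr i64 %shadow_int to ptr
+;   store i8 %lane_shadow, ptr %shadow_ptr
+;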
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android9001"
+
+define void @st1lane_16b(<16 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 1
+ %tmp = extractelement <16 x i8> %A, i32 1
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_16b(<16 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 1
+ %tmp = extractelement <16 x i8> %A, i32 0
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_16b(<16 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 -1
+ %tmp = extractelement <16 x i8> %A, i32 0
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 %offset
+ %tmp = extractelement <16 x i8> %A, i32 1
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 %offset
+ %tmp = extractelement <16 x i8> %A, i32 0
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_8h(<8 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 1
+ %tmp = extractelement <8 x i16> %A, i32 1
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_8h(<8 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 1
+ %tmp = extractelement <8 x i16> %A, i32 0
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_8h(<8 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 -1
+ %tmp = extractelement <8 x i16> %A, i32 0
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 %offset
+ %tmp = extractelement <8 x i16> %A, i32 1
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 %offset
+ %tmp = extractelement <8 x i16> %A, i32 0
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_4s(<4 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 1
+ %tmp = extractelement <4 x i32> %A, i32 1
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_4s(<4 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 1
+ %tmp = extractelement <4 x i32> %A, i32 0
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_4s(<4 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 -1
+ %tmp = extractelement <4 x i32> %A, i32 0
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 %offset
+ %tmp = extractelement <4 x i32> %A, i32 1
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 %offset
+ %tmp = extractelement <4 x i32> %A, i32 0
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_4s_float(<4 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 1
+ %tmp = extractelement <4 x float> %A, i32 1
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_4s_float(<4 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 1
+ %tmp = extractelement <4 x float> %A, i32 0
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_4s_float(<4 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 -1
+ %tmp = extractelement <4 x float> %A, i32 0
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 %offset
+ %tmp = extractelement <4 x float> %A, i32 1
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_4s_float(
+; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 %offset
+ %tmp = extractelement <4 x float> %A, i32 0
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_2d(<2 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 1
+ %tmp = extractelement <2 x i64> %A, i32 1
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_2d(<2 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 1
+ %tmp = extractelement <2 x i64> %A, i32 0
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_2d(<2 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 -1
+ %tmp = extractelement <2 x i64> %A, i32 0
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 %offset
+ %tmp = extractelement <2 x i64> %A, i32 1
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 %offset
+ %tmp = extractelement <2 x i64> %A, i32 0
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_2d_double(<2 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 1
+ %tmp = extractelement <2 x double> %A, i32 1
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_2d_double(<2 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 1
+ %tmp = extractelement <2 x double> %A, i32 0
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_2d_double(<2 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 -1
+ %tmp = extractelement <2 x double> %A, i32 0
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 %offset
+ %tmp = extractelement <2 x double> %A, i32 1
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2d_double(
+; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 %offset
+ %tmp = extractelement <2 x double> %A, i32 0
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_8b(<8 x i8> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP7]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 1
+ %tmp = extractelement <8 x i8> %A, i32 1
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 %offset
+ %tmp = extractelement <8 x i8> %A, i32 1
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_8b(
+; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i8, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i8 [[_MSPROP1]], ptr [[TMP8]], align 1
+; CHECK-NEXT: store i8 [[TMP]], ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i8, ptr %D, i64 %offset
+ %tmp = extractelement <8 x i8> %A, i32 0
+ store i8 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_4h(<4 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 1
+ %tmp = extractelement <4 x i16> %A, i32 1
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_4h(<4 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 1
+ %tmp = extractelement <4 x i16> %A, i32 0
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_4h(<4 x i16> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP7]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 -1
+ %tmp = extractelement <4 x i16> %A, i32 0
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 %offset
+ %tmp = extractelement <4 x i16> %A, i32 1
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_4h(
+; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i16, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i16 [[_MSPROP1]], ptr [[TMP8]], align 2
+; CHECK-NEXT: store i16 [[TMP]], ptr [[PTR]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i16, ptr %D, i64 %offset
+ %tmp = extractelement <4 x i16> %A, i32 0
+ store i16 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_2s(<2 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 1
+ %tmp = extractelement <2 x i32> %A, i32 1
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_2s(<2 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 1
+ %tmp = extractelement <2 x i32> %A, i32 0
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_2s(<2 x i32> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 -1
+ %tmp = extractelement <2 x i32> %A, i32 0
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 %offset
+ %tmp = extractelement <2 x i32> %A, i32 1
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2s(
+; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i32, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store i32 [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i32, ptr %D, i64 %offset
+ %tmp = extractelement <2 x i32> %A, i32 0
+ store i32 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_2s_float(<2 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 1
+ %tmp = extractelement <2 x float> %A, i32 1
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_2s_float(<2 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 1
+ %tmp = extractelement <2 x float> %A, i32 0
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_2s_float(<2 x float> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP7]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 -1
+ %tmp = extractelement <2 x float> %A, i32 0
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane_ro_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 1
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 %offset
+ %tmp = extractelement <2 x float> %A, i32 1
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_2s_float(
+; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr float, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP1]], ptr [[TMP8]], align 4
+; CHECK-NEXT: store float [[TMP]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr float, ptr %D, i64 %offset
+ %tmp = extractelement <2 x float> %A, i32 0
+ store float %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_1d(<1 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 1
+ %tmp = extractelement <1 x i64> %A, i32 0
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_1d(<1 x i64> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 -1
+ %tmp = extractelement <1 x i64> %A, i32 0
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_1d(<1 x i64> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_1d(
+; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr i64, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store i64 [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr i64, ptr %D, i64 %offset
+ %tmp = extractelement <1 x i64> %A, i32 0
+ store i64 %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_1d_double(<1 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_1d_double(
+; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 1
+ %tmp = extractelement <1 x double> %A, i32 0
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0u_1d_double(<1 x double> %A, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0u_1d_double(
+; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 -1
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP7]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 -1
+ %tmp = extractelement <1 x double> %A, i32 0
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st1lane0_ro_1d_double(<1 x double> %A, ptr %D, i64 %offset) sanitize_memory {
+; CHECK-LABEL: define void @st1lane0_ro_1d_double(
+; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[PTR:%.*]] = getelementptr double, ptr [[D]], i64 [[OFFSET]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP1]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store double [[TMP]], ptr [[PTR]], align 8
+; CHECK-NEXT: ret void
+;
+ %ptr = getelementptr double, ptr %D, i64 %offset
+ %tmp = extractelement <1 x double> %A, i32 0
+ store double %tmp, ptr %ptr
+ ret void
+}
+
+define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> %A, <16 x i8> %B, i64 1, ptr %D)
+ ret void
+}
+
+define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> %A, <8 x i16> %B, i64 1, ptr %D)
+ ret void
+}
+
+define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> %A, <4 x i32> %B, i64 1, ptr %D)
+ ret void
+}
+
+define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st2lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> %A, <2 x i64> %B, i64 1, ptr %D)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8>, <16 x i8>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16>, <8 x i16>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32>, <4 x i32>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64>, <2 x i64>, i64, ptr) nounwind readnone
+
+define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, ptr %D)
+ ret void
+}
+
+define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, ptr %D)
+ ret void
+}
+
+define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, ptr %D)
+ ret void
+}
+
+define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %D) sanitize_memory {
+; CHECK-LABEL: define void @st3lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], i64 1, ptr [[D]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, ptr %D)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readnone
+
+define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_16b(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, ptr %E)
+ ret void
+}
+
+define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_8h(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP5]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], <8 x i16> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, ptr %E)
+ ret void
+}
+
+define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_4s(
+; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]], <4 x i32> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, ptr %E)
+ ret void
+}
+
+define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %E) sanitize_memory {
+; CHECK-LABEL: define void @st4lane_2d(
+; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> [[C]], <2 x i64> [[D]], i64 1, ptr [[E]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, ptr %E)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readnone
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll
index 818da89..5228381 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll
@@ -30,6 +30,7 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 35184372088832
; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4
; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], ptr [[TMP9]])
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i128 [[TMP13]], 0
; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP6]], i32 [[TMP4]]
@@ -55,7 +56,6 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor
; CHECK-NEXT: unreachable
; CHECK: 24:
; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], ptr [[TMP9]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %P)
@@ -82,6 +82,7 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP10]], 35184372088832
; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], -4
; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], ptr [[TMP11]])
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP6]], i32 [[TMP4]]
@@ -118,7 +119,6 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind
; CHECK-NEXT: unreachable
; CHECK: 33:
; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], ptr [[TMP11]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P)
@@ -147,6 +147,7 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP12]], 35184372088832
; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[TMP14]], -4
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
+; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]], ptr [[TMP13]])
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP5]] to i128
; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i128 [[TMP17]], 0
; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP6]], i32 [[TMP4]]
@@ -194,7 +195,6 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr
; CHECK-NEXT: unreachable
; CHECK: 42:
; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]])
-; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]], ptr [[TMP13]])
; CHECK-NEXT: ret void
;
call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index dce1aa2..1d2e38e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -4,7 +4,7 @@
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test1(
@@ -35,16 +35,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test88(
@@ -75,16 +75,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test87(
@@ -115,16 +115,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test86(
@@ -155,16 +155,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test85(
@@ -195,16 +195,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test84(
@@ -235,16 +235,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test83(
@@ -275,16 +275,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test82(
@@ -315,16 +315,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test81(
@@ -355,16 +355,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test80(
@@ -395,16 +395,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test79(
@@ -435,16 +435,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test78(
@@ -475,16 +475,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test77(
@@ -515,16 +515,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test76(
@@ -563,16 +563,16 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test75(
@@ -611,16 +611,16 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test74(
@@ -659,16 +659,16 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone
define i64 @test73(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test73(
@@ -694,15 +694,15 @@ define i64 @test73(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone
define i64 @test72(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test72(
@@ -728,9 +728,9 @@ define i64 @test72(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
@@ -760,15 +760,15 @@ define i64 @test72_2(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
define i64 @test71(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test71(
@@ -790,13 +790,13 @@ define i64 @test71(<1 x i64> %a) #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to i64
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to i64
ret i64 %2
}
-declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone
define i64 @test70(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test70(
@@ -822,9 +822,9 @@ define i64 @test70(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
@@ -854,15 +854,15 @@ define i64 @test70_2(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone
define i64 @test69(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test69(
@@ -888,15 +888,15 @@ define i64 @test69(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone
define i64 @test68(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test68(
@@ -918,13 +918,13 @@ define i64 @test68(<1 x i64> %a) #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to i64
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to i64
ret i64 %2
}
-declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone
define i64 @test67(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test67(
@@ -950,15 +950,15 @@ define i64 @test67(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
%3 = bitcast <2 x i32> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone
define i64 @test66(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test66(
@@ -984,9 +984,9 @@ define i64 @test66(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
@@ -1016,15 +1016,15 @@ define i64 @test66_2(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind
- %2 = bitcast x86_mmx %1 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
%3 = bitcast <4 x i16> %2 to <1 x i64>
%4 = extractelement <1 x i64> %3, i32 0
ret i64 %4
}
-declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test65(
@@ -1059,17 +1059,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test64(
@@ -1104,17 +1104,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test63(
@@ -1145,15 +1145,15 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test62(
@@ -1188,17 +1188,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test61(
@@ -1233,17 +1233,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test60(
@@ -1274,15 +1274,15 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test59(
@@ -1317,17 +1317,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx
+ %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test58(
@@ -1362,17 +1362,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx
+ %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1.i = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var1.i = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test56(
@@ -1403,16 +1403,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test55(
@@ -1443,16 +1443,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test54(
@@ -1483,16 +1483,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test53(
@@ -1523,16 +1523,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test52(
@@ -1563,10 +1563,10 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
@@ -1601,16 +1601,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test50(
@@ -1641,16 +1641,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test49(
@@ -1685,16 +1685,16 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test48(
@@ -1725,16 +1725,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test47(
@@ -1765,16 +1765,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test46(
@@ -1805,16 +1805,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test45(
@@ -1845,10 +1845,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
@@ -1878,17 +1878,17 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var = bitcast i64 %0 to x86_mmx
+ %mmx_var = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1 = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1 = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test43(
@@ -1919,16 +1919,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test42(
@@ -1959,16 +1959,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test41(
@@ -1999,16 +1999,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test40(
@@ -2039,16 +2039,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test39(
@@ -2079,16 +2079,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test38(
@@ -2119,16 +2119,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test37(
@@ -2159,16 +2159,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test36(
@@ -2194,15 +2194,15 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var = bitcast i64 %0 to x86_mmx
+ %mmx_var = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1 = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1)
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1 = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test35(
@@ -2233,16 +2233,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test34(
@@ -2273,16 +2273,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test33(
@@ -2313,16 +2313,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test32(
@@ -2354,14 +2354,14 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test31(
@@ -2392,16 +2392,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test30(
@@ -2432,16 +2432,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test29(
@@ -2472,16 +2472,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test28(
@@ -2512,16 +2512,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test27(
@@ -2552,16 +2552,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test26(
@@ -2592,16 +2592,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind
+declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind
define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
; CHECK-LABEL: define void @test25(
@@ -2629,12 +2629,12 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var.i = bitcast i64 %0 to x86_mmx
- tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind
+ %mmx_var.i = bitcast i64 %0 to <1 x i64>
+ tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind
ret void
}
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone
define i32 @test24(<1 x i64> %a) #0 {
; CHECK-LABEL: define i32 @test24(
@@ -2659,12 +2659,12 @@ define i32 @test24(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx
- %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind
+ %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64>
+ %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind
ret i32 %1
}
-declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind
+declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind
define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0 {
; CHECK-LABEL: define void @test23(
@@ -2700,13 +2700,13 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0
entry:
%0 = bitcast <1 x i64> %n to <8 x i8>
%1 = bitcast <1 x i64> %d to <8 x i8>
- %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx
- %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx
- tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind
+ %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64>
+ tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind
ret void
}
-declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test22(
@@ -2737,16 +2737,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx
- %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone
define i64 @test21(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test21(
@@ -2774,9 +2774,9 @@ define i64 @test21(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %1 = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
@@ -2808,15 +2808,15 @@ define i32 @test21_2(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %1 = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <2 x i32>
%5 = extractelement <2 x i32> %4, i32 0
ret i32 %5
}
-declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test20(
@@ -2843,14 +2843,14 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx
- %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64>
+ %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone
define <2 x double> @test19(<1 x i64> %a) #0 {
; CHECK-LABEL: define <2 x double> @test19(
@@ -2875,12 +2875,12 @@ define <2 x double> @test19(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %1 = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone
+ %1 = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone
ret <2 x double> %2
}
-declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
define i64 @test18(<2 x double> %a) #0 {
; CHECK-LABEL: define i64 @test18(
@@ -2903,14 +2903,14 @@ define i64 @test18(<2 x double> %a) #0 {
; CHECK-NEXT: ret i64 [[TMP3]]
;
entry:
- %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
- %1 = bitcast x86_mmx %0 to <2 x i32>
+ %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone
+ %1 = bitcast <1 x i64> %0 to <2 x i32>
%2 = bitcast <2 x i32> %1 to <1 x i64>
%3 = extractelement <1 x i64> %2, i32 0
ret i64 %3
}
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
define i64 @test17(<2 x double> %a) #0 {
; CHECK-LABEL: define i64 @test17(
@@ -2933,14 +2933,14 @@ define i64 @test17(<2 x double> %a) #0 {
; CHECK-NEXT: ret i64 [[TMP3]]
;
entry:
- %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
- %1 = bitcast x86_mmx %0 to <2 x i32>
+ %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone
+ %1 = bitcast <1 x i64> %0 to <2 x i32>
%2 = bitcast <2 x i32> %1 to <1 x i64>
%3 = extractelement <1 x i64> %2, i32 0
ret i64 %3
}
-declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone
define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test16(
@@ -2974,15 +2974,15 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
;
entry:
%0 = extractelement <1 x i64> %a, i32 0
- %mmx_var = bitcast i64 %0 to x86_mmx
+ %mmx_var = bitcast i64 %0 to <1 x i64>
%1 = extractelement <1 x i64> %b, i32 0
- %mmx_var1 = bitcast i64 %1 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16)
- %3 = bitcast x86_mmx %2 to i64
+ %mmx_var1 = bitcast i64 %1 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16)
+ %3 = bitcast <1 x i64> %2 to i64
ret i64 %3
}
-declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone
define i64 @test15(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test15(
@@ -3006,15 +3006,15 @@ define i64 @test15(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <2 x i32>
- %1 = bitcast <2 x i32> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone
- %3 = bitcast x86_mmx %2 to <2 x i32>
+ %1 = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
%4 = bitcast <2 x i32> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone
define i64 @test14(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test14(
@@ -3038,15 +3038,15 @@ define i64 @test14(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <4 x i16>
- %1 = bitcast <4 x i16> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone
- %3 = bitcast x86_mmx %2 to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
%4 = bitcast <4 x i16> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone
define i64 @test13(<1 x i64> %a) #0 {
; CHECK-LABEL: define i64 @test13(
@@ -3070,15 +3070,15 @@ define i64 @test13(<1 x i64> %a) #0 {
;
entry:
%0 = bitcast <1 x i64> %a to <8 x i8>
- %1 = bitcast <8 x i8> %0 to x86_mmx
- %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone
- %3 = bitcast x86_mmx %2 to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to <1 x i64>
+ %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
%4 = bitcast <8 x i8> %3 to <1 x i64>
%5 = extractelement <1 x i64> %4, i32 0
ret i64 %5
}
-declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test12(
@@ -3109,16 +3109,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %2 = bitcast <2 x i32> %1 to x86_mmx
- %3 = bitcast <2 x i32> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <2 x i32>
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = bitcast <2 x i32> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <2 x i32>
%6 = bitcast <2 x i32> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test11(
@@ -3149,16 +3149,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test10(
@@ -3189,16 +3189,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %2 = bitcast <8 x i8> %1 to x86_mmx
- %3 = bitcast <8 x i8> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <8 x i8>
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = bitcast <8 x i8> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <8 x i8>
%6 = bitcast <8 x i8> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test9(
@@ -3229,16 +3229,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %2 = bitcast <8 x i8> %1 to x86_mmx
- %3 = bitcast <8 x i8> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <8 x i8>
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = bitcast <8 x i8> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <8 x i8>
%6 = bitcast <8 x i8> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test8(
@@ -3269,16 +3269,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test7(
@@ -3313,16 +3313,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
%1 = bitcast <1 x i64> %a to <8 x i8>
- %2 = bitcast <8 x i8> %1 to x86_mmx
- %3 = bitcast <8 x i8> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <8 x i8>
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = bitcast <8 x i8> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <8 x i8>
%6 = bitcast <8 x i8> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test6(
@@ -3353,16 +3353,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test5(
@@ -3393,16 +3393,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %2 = bitcast <2 x i32> %1 to x86_mmx
- %3 = bitcast <2 x i32> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <2 x i32>
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = bitcast <2 x i32> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <2 x i32>
%6 = bitcast <2 x i32> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test4(
@@ -3433,16 +3433,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test3(
@@ -3473,16 +3473,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
%1 = bitcast <1 x i64> %a to <4 x i16>
- %2 = bitcast <4 x i16> %1 to x86_mmx
- %3 = bitcast <4 x i16> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <4 x i16>
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = bitcast <4 x i16> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <4 x i16>
%6 = bitcast <4 x i16> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone
define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-LABEL: define i64 @test2(
@@ -3513,16 +3513,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
%1 = bitcast <1 x i64> %a to <2 x i32>
- %2 = bitcast <2 x i32> %1 to x86_mmx
- %3 = bitcast <2 x i32> %0 to x86_mmx
- %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone
- %5 = bitcast x86_mmx %4 to <2 x i32>
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = bitcast <2 x i32> %0 to <1 x i64>
+ %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone
+ %5 = bitcast <1 x i64> %4 to <2 x i32>
%6 = bitcast <2 x i32> %5 to <1 x i64>
%7 = extractelement <1 x i64> %6, i32 0
ret i64 %7
}
-define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 {
+define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind #0 {
; ALL-LABEL: test89:
; ALL: # %bb.0:
; ALL-NEXT: cvtpi2ps %mm0, %xmm0
@@ -3546,11 +3546,11 @@ define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 {
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[C]]
;
- %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
+ %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b)
ret <4 x float> %c
}
-declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone
define void @test90() #0 {
; ALL-LABEL: test90:
@@ -3576,29 +3576,25 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK: 4:
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK: 3:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[TMP8]], i32 [[D]], i32 2)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP9]] to <1 x i64>
+; CHECK: 4:
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[A_COERCE]], i32 [[D]], i32 2)
; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <1 x i64> [[TMP2]]
+; CHECK-NEXT: ret <1 x i64> [[TMP9]]
;
entry:
- %0 = bitcast <1 x i64> %a.coerce to x86_mmx
- %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2)
- %2 = bitcast x86_mmx %1 to <1 x i64>
- ret <1 x i64> %2
+ %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2)
+ ret <1 x i64> %1
}
-declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg)
+declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg)
define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 {
; CHECK-LABEL: define i32 @test_mm_extract_pi16(
@@ -3606,25 +3602,23 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
-; CHECK: 3:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP3:%.*]], !prof [[PROF0]]
+; CHECK: 2:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[TMP6]], i32 2)
+; CHECK: 3:
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[A_COERCE]], i32 2)
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[TMP1]]
;
entry:
- %0 = bitcast <1 x i64> %a.coerce to x86_mmx
- %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2)
+ %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2)
ret i32 %1
}
-declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg)
+declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg)
attributes #0 = { sanitize_memory }
;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 5197f32..fe5cf9d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -6,9 +6,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-unknown-linux-gnu"
declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
-declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory {
entry:
@@ -24,10 +24,10 @@ entry:
; CHECK: ret <4 x i32>
-define x86_mmx @Test_ssse3_pmadd_ub_sw(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
entry:
- %c = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind
- ret x86_mmx %c
+ %c = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind
+ ret <1 x i64> %c
}
; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw(
@@ -53,10 +53,10 @@ define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_mem
; CHECK: ret <2 x i64>
-define x86_mmx @Test_x86_mmx_psad_bw(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_x86_mmx_psad_bw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
entry:
- %c = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind
- ret x86_mmx %c
+ %c = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind
+ ret <1 x i64> %c
}
; CHECK-LABEL: @Test_x86_mmx_psad_bw(
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
index 6ae03f2..e920270 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
; Single argument vector conversion.
@@ -27,12 +27,12 @@ entry:
; CHECK: store i32 0, {{.*}} @__msan_retval_tls
; CHECK: ret i32
-; x86_mmx packed vector conversion.
+; <1 x i64> packed vector conversion.
-define x86_mmx @test_cvtps2pi(<4 x float> %value) sanitize_memory {
+define <1 x i64> @test_cvtps2pi(<4 x float> %value) sanitize_memory {
entry:
- %0 = tail call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %value)
- ret x86_mmx %0
+ %0 = tail call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %value)
+ ret <1 x i64> %0
}
; CHECK-LABEL: @test_cvtps2pi
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
index 1289abd..13f7a16 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
define <8 x i16> @Test_packssdw_128(<4 x i32> %a, <4 x i32> %b) sanitize_memory {
entry:
@@ -41,10 +41,10 @@ entry:
; CHECK: ret <32 x i8>
-define x86_mmx @Test_mmx_packuswb(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_mmx_packuswb(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
entry:
- %c = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind
- ret x86_mmx %c
+ %c = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind
+ ret <1 x i64> %c
}
; CHECK-LABEL: @Test_mmx_packuswb(
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
index 3c6c441..441dd8f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -7,7 +7,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>)
declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>)
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>)
@@ -19,10 +19,10 @@ declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32)
define i64 @test_mmx(i64 %x.coerce, i64 %y.coerce) sanitize_memory {
entry:
%0 = bitcast i64 %x.coerce to <2 x i32>
- %1 = bitcast <2 x i32> %0 to x86_mmx
- %2 = bitcast i64 %y.coerce to x86_mmx
- %3 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %1, x86_mmx %2)
- %4 = bitcast x86_mmx %3 to <2 x i32>
+ %1 = bitcast <2 x i32> %0 to <1 x i64>
+ %2 = bitcast i64 %y.coerce to <1 x i64>
+ %3 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %1, <1 x i64> %2)
+ %4 = bitcast <1 x i64> %3 to <2 x i32>
%5 = bitcast <2 x i32> %4 to <1 x i64>
%6 = extractelement <1 x i64> %5, i32 0
ret i64 %6
diff --git a/llvm/test/LTO/X86/print-pipeline-passes.ll b/llvm/test/LTO/X86/print-pipeline-passes.ll
new file mode 100644
index 0000000..b24e386
--- /dev/null
+++ b/llvm/test/LTO/X86/print-pipeline-passes.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s >%t.bc
+; RUN: llvm-lto -print-pipeline-passes -exported-symbol=_f -o /dev/null %t.bc 2>&1 | FileCheck %s
+
+; CHECK: pipeline-passes: verify,{{.*}},verify
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+
+define void @f() {
+entry:
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s
new file mode 100644
index 0000000..8fef2ab
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1-fake16.s
@@ -0,0 +1,3597 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+
+v_bfrev_b32_e32 v5, v1
+// GFX12: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e]
+
+v_bfrev_b32 v5, v255
+// GFX12: v_bfrev_b32_e32 v5, v255 ; encoding: [0xff,0x71,0x0a,0x7e]
+
+v_bfrev_b32 v5, s1
+// GFX12: v_bfrev_b32_e32 v5, s1 ; encoding: [0x01,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, s105
+// GFX12: v_bfrev_b32_e32 v5, s105 ; encoding: [0x69,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, vcc_lo
+// GFX12: v_bfrev_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, vcc_hi
+// GFX12: v_bfrev_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, ttmp15
+// GFX12: v_bfrev_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, m0
+// GFX12: v_bfrev_b32_e32 v5, m0 ; encoding: [0x7d,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, exec_lo
+// GFX12: v_bfrev_b32_e32 v5, exec_lo ; encoding: [0x7e,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, exec_hi
+// GFX12: v_bfrev_b32_e32 v5, exec_hi ; encoding: [0x7f,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, null
+// GFX12: v_bfrev_b32_e32 v5, null ; encoding: [0x7c,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, -1
+// GFX12: v_bfrev_b32_e32 v5, -1 ; encoding: [0xc1,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, 0.5
+// GFX12: v_bfrev_b32_e32 v5, 0.5 ; encoding: [0xf0,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v5, src_scc
+// GFX12: v_bfrev_b32_e32 v5, src_scc ; encoding: [0xfd,0x70,0x0a,0x7e]
+
+v_bfrev_b32 v255, 0xaf123456
+// GFX12: v_bfrev_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x70,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ceil_f16 v5, v1
+// GFX12: v_ceil_f16_e32 v5, v1 ; encoding: [0x01,0xb9,0x0a,0x7e]
+
+v_ceil_f16 v5, v127
+// GFX12: v_ceil_f16_e32 v5, v127 ; encoding: [0x7f,0xb9,0x0a,0x7e]
+
+v_ceil_f16 v5, s1
+// GFX12: v_ceil_f16_e32 v5, s1 ; encoding: [0x01,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, s105
+// GFX12: v_ceil_f16_e32 v5, s105 ; encoding: [0x69,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, vcc_lo
+// GFX12: v_ceil_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, vcc_hi
+// GFX12: v_ceil_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, ttmp15
+// GFX12: v_ceil_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, m0
+// GFX12: v_ceil_f16_e32 v5, m0 ; encoding: [0x7d,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, exec_lo
+// GFX12: v_ceil_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, exec_hi
+// GFX12: v_ceil_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, null
+// GFX12: v_ceil_f16_e32 v5, null ; encoding: [0x7c,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, -1
+// GFX12: v_ceil_f16_e32 v5, -1 ; encoding: [0xc1,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, 0.5
+// GFX12: v_ceil_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v5, src_scc
+// GFX12: v_ceil_f16_e32 v5, src_scc ; encoding: [0xfd,0xb8,0x0a,0x7e]
+
+v_ceil_f16 v127, 0xfe0b
+// GFX12: v_ceil_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_ceil_f32 v5, v1
+// GFX12: v_ceil_f32_e32 v5, v1 ; encoding: [0x01,0x45,0x0a,0x7e]
+
+v_ceil_f32 v5, v255
+// GFX12: v_ceil_f32_e32 v5, v255 ; encoding: [0xff,0x45,0x0a,0x7e]
+
+v_ceil_f32 v5, s1
+// GFX12: v_ceil_f32_e32 v5, s1 ; encoding: [0x01,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, s105
+// GFX12: v_ceil_f32_e32 v5, s105 ; encoding: [0x69,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, vcc_lo
+// GFX12: v_ceil_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, vcc_hi
+// GFX12: v_ceil_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, ttmp15
+// GFX12: v_ceil_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, m0
+// GFX12: v_ceil_f32_e32 v5, m0 ; encoding: [0x7d,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, exec_lo
+// GFX12: v_ceil_f32_e32 v5, exec_lo ; encoding: [0x7e,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, exec_hi
+// GFX12: v_ceil_f32_e32 v5, exec_hi ; encoding: [0x7f,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, null
+// GFX12: v_ceil_f32_e32 v5, null ; encoding: [0x7c,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, -1
+// GFX12: v_ceil_f32_e32 v5, -1 ; encoding: [0xc1,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, 0.5
+// GFX12: v_ceil_f32_e32 v5, 0.5 ; encoding: [0xf0,0x44,0x0a,0x7e]
+
+v_ceil_f32 v5, src_scc
+// GFX12: v_ceil_f32_e32 v5, src_scc ; encoding: [0xfd,0x44,0x0a,0x7e]
+
+v_ceil_f32 v255, 0xaf123456
+// GFX12: v_ceil_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ceil_f64 v[5:6], v[1:2]
+// GFX12: v_ceil_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x31,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], v[254:255]
+// GFX12: v_ceil_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x31,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], s[2:3]
+// GFX12: v_ceil_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], s[104:105]
+// GFX12: v_ceil_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], vcc
+// GFX12: v_ceil_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], ttmp[14:15]
+// GFX12: v_ceil_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], exec
+// GFX12: v_ceil_f64_e32 v[5:6], exec ; encoding: [0x7e,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], null
+// GFX12: v_ceil_f64_e32 v[5:6], null ; encoding: [0x7c,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], -1
+// GFX12: v_ceil_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], 0.5
+// GFX12: v_ceil_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[5:6], src_scc
+// GFX12: v_ceil_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x30,0x0a,0x7e]
+
+v_ceil_f64 v[254:255], 0xaf123456
+// GFX12: v_ceil_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x30,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cls_i32 v5, v1
+// GFX12: v_cls_i32_e32 v5, v1 ; encoding: [0x01,0x77,0x0a,0x7e]
+
+v_cls_i32 v5, v255
+// GFX12: v_cls_i32_e32 v5, v255 ; encoding: [0xff,0x77,0x0a,0x7e]
+
+v_cls_i32 v5, s1
+// GFX12: v_cls_i32_e32 v5, s1 ; encoding: [0x01,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, s105
+// GFX12: v_cls_i32_e32 v5, s105 ; encoding: [0x69,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, vcc_lo
+// GFX12: v_cls_i32_e32 v5, vcc_lo ; encoding: [0x6a,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, vcc_hi
+// GFX12: v_cls_i32_e32 v5, vcc_hi ; encoding: [0x6b,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, ttmp15
+// GFX12: v_cls_i32_e32 v5, ttmp15 ; encoding: [0x7b,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, m0
+// GFX12: v_cls_i32_e32 v5, m0 ; encoding: [0x7d,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, exec_lo
+// GFX12: v_cls_i32_e32 v5, exec_lo ; encoding: [0x7e,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, exec_hi
+// GFX12: v_cls_i32_e32 v5, exec_hi ; encoding: [0x7f,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, null
+// GFX12: v_cls_i32_e32 v5, null ; encoding: [0x7c,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, -1
+// GFX12: v_cls_i32_e32 v5, -1 ; encoding: [0xc1,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, 0.5
+// GFX12: v_cls_i32_e32 v5, 0.5 ; encoding: [0xf0,0x76,0x0a,0x7e]
+
+v_cls_i32 v5, src_scc
+// GFX12: v_cls_i32_e32 v5, src_scc ; encoding: [0xfd,0x76,0x0a,0x7e]
+
+v_cls_i32 v255, 0xaf123456
+// GFX12: v_cls_i32_e32 v255, 0xaf123456 ; encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_clz_i32_u32 v5, v1
+// GFX12: v_clz_i32_u32_e32 v5, v1 ; encoding: [0x01,0x73,0x0a,0x7e]
+
+v_clz_i32_u32 v5, v255
+// GFX12: v_clz_i32_u32_e32 v5, v255 ; encoding: [0xff,0x73,0x0a,0x7e]
+
+v_clz_i32_u32 v5, s1
+// GFX12: v_clz_i32_u32_e32 v5, s1 ; encoding: [0x01,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, s105
+// GFX12: v_clz_i32_u32_e32 v5, s105 ; encoding: [0x69,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, vcc_lo
+// GFX12: v_clz_i32_u32_e32 v5, vcc_lo ; encoding: [0x6a,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, vcc_hi
+// GFX12: v_clz_i32_u32_e32 v5, vcc_hi ; encoding: [0x6b,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, ttmp15
+// GFX12: v_clz_i32_u32_e32 v5, ttmp15 ; encoding: [0x7b,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, m0
+// GFX12: v_clz_i32_u32_e32 v5, m0 ; encoding: [0x7d,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, exec_lo
+// GFX12: v_clz_i32_u32_e32 v5, exec_lo ; encoding: [0x7e,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, exec_hi
+// GFX12: v_clz_i32_u32_e32 v5, exec_hi ; encoding: [0x7f,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, null
+// GFX12: v_clz_i32_u32_e32 v5, null ; encoding: [0x7c,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, -1
+// GFX12: v_clz_i32_u32_e32 v5, -1 ; encoding: [0xc1,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, 0.5
+// GFX12: v_clz_i32_u32_e32 v5, 0.5 ; encoding: [0xf0,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v5, src_scc
+// GFX12: v_clz_i32_u32_e32 v5, src_scc ; encoding: [0xfd,0x72,0x0a,0x7e]
+
+v_clz_i32_u32 v255, 0xaf123456
+// GFX12: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cos_f16 v5, v1
+// GFX12: v_cos_f16_e32 v5, v1 ; encoding: [0x01,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5, v127
+// GFX12: v_cos_f16_e32 v5, v127 ; encoding: [0x7f,0xc3,0x0a,0x7e]
+
+v_cos_f16 v5, s1
+// GFX12: v_cos_f16_e32 v5, s1 ; encoding: [0x01,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, s105
+// GFX12: v_cos_f16_e32 v5, s105 ; encoding: [0x69,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, vcc_lo
+// GFX12: v_cos_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, vcc_hi
+// GFX12: v_cos_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, ttmp15
+// GFX12: v_cos_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, m0
+// GFX12: v_cos_f16_e32 v5, m0 ; encoding: [0x7d,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, exec_lo
+// GFX12: v_cos_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, exec_hi
+// GFX12: v_cos_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, null
+// GFX12: v_cos_f16_e32 v5, null ; encoding: [0x7c,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, -1
+// GFX12: v_cos_f16_e32 v5, -1 ; encoding: [0xc1,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, 0.5
+// GFX12: v_cos_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc2,0x0a,0x7e]
+
+v_cos_f16 v5, src_scc
+// GFX12: v_cos_f16_e32 v5, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e]
+
+v_cos_f16 v127, 0xfe0b
+// GFX12: v_cos_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cos_f32 v5, v1
+// GFX12: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e]
+
+v_cos_f32 v5, v255
+// GFX12: v_cos_f32_e32 v5, v255 ; encoding: [0xff,0x6d,0x0a,0x7e]
+
+v_cos_f32 v5, s1
+// GFX12: v_cos_f32_e32 v5, s1 ; encoding: [0x01,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, s105
+// GFX12: v_cos_f32_e32 v5, s105 ; encoding: [0x69,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, vcc_lo
+// GFX12: v_cos_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, vcc_hi
+// GFX12: v_cos_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, ttmp15
+// GFX12: v_cos_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, m0
+// GFX12: v_cos_f32_e32 v5, m0 ; encoding: [0x7d,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, exec_lo
+// GFX12: v_cos_f32_e32 v5, exec_lo ; encoding: [0x7e,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, exec_hi
+// GFX12: v_cos_f32_e32 v5, exec_hi ; encoding: [0x7f,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, null
+// GFX12: v_cos_f32_e32 v5, null ; encoding: [0x7c,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, -1
+// GFX12: v_cos_f32_e32 v5, -1 ; encoding: [0xc1,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, 0.5
+// GFX12: v_cos_f32_e32 v5, 0.5 ; encoding: [0xf0,0x6c,0x0a,0x7e]
+
+v_cos_f32 v5, src_scc
+// GFX12: v_cos_f32_e32 v5, src_scc ; encoding: [0xfd,0x6c,0x0a,0x7e]
+
+v_cos_f32 v255, 0xaf123456
+// GFX12: v_cos_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ctz_i32_b32 v5, v1
+// GFX12: v_ctz_i32_b32_e32 v5, v1 ; encoding: [0x01,0x75,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, v255
+// GFX12: v_ctz_i32_b32_e32 v5, v255 ; encoding: [0xff,0x75,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, s1
+// GFX12: v_ctz_i32_b32_e32 v5, s1 ; encoding: [0x01,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, s105
+// GFX12: v_ctz_i32_b32_e32 v5, s105 ; encoding: [0x69,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, vcc_lo
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, vcc_hi
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, ttmp15
+// GFX12: v_ctz_i32_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, m0
+// GFX12: v_ctz_i32_b32_e32 v5, m0 ; encoding: [0x7d,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, exec_lo
+// GFX12: v_ctz_i32_b32_e32 v5, exec_lo ; encoding: [0x7e,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, exec_hi
+// GFX12: v_ctz_i32_b32_e32 v5, exec_hi ; encoding: [0x7f,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, null
+// GFX12: v_ctz_i32_b32_e32 v5, null ; encoding: [0x7c,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, -1
+// GFX12: v_ctz_i32_b32_e32 v5, -1 ; encoding: [0xc1,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, 0.5
+// GFX12: v_ctz_i32_b32_e32 v5, 0.5 ; encoding: [0xf0,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v5, src_scc
+// GFX12: v_ctz_i32_b32_e32 v5, src_scc ; encoding: [0xfd,0x74,0x0a,0x7e]
+
+v_ctz_i32_b32 v255, 0xaf123456
+// GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_bf8_e32 v1, s3
+// GFX12: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xda,0x02,0x7e]
+
+v_cvt_f32_bf8_e32 v1, 3
+// GFX12: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xda,0x02,0x7e]
+
+v_cvt_f32_bf8_e32 v1, v3
+// GFX12: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xdb,0x02,0x7e]
+
+v_cvt_f32_fp8_e32 v1, s3
+// GFX12: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xd8,0x02,0x7e]
+
+v_cvt_f32_fp8_e32 v1, 3
+// GFX12: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xd8,0x02,0x7e]
+
+v_cvt_f32_fp8_e32 v1, v3
+// GFX12: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], s3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[3:4], s5
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], s5 ; encoding: [0x05,0xde,0x06,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], 3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[3:4], 3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], 3 ; encoding: [0x83,0xde,0x06,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], v3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[3:4], v3
+// GFX12: v_cvt_pk_f32_bf8_e32 v[3:4], v3 ; encoding: [0x03,0xdf,0x06,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], s3
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], 3
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], v3
+// GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e]
+
+v_cvt_f16_f32 v5, v1
+// GFX12: v_cvt_f16_f32_e32 v5, v1 ; encoding: [0x01,0x15,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, v255
+// GFX12: v_cvt_f16_f32_e32 v5, v255 ; encoding: [0xff,0x15,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, s1
+// GFX12: v_cvt_f16_f32_e32 v5, s1 ; encoding: [0x01,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, s105
+// GFX12: v_cvt_f16_f32_e32 v5, s105 ; encoding: [0x69,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, vcc_lo
+// GFX12: v_cvt_f16_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, vcc_hi
+// GFX12: v_cvt_f16_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, ttmp15
+// GFX12: v_cvt_f16_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, m0
+// GFX12: v_cvt_f16_f32_e32 v5, m0 ; encoding: [0x7d,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, exec_lo
+// GFX12: v_cvt_f16_f32_e32 v5, exec_lo ; encoding: [0x7e,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, exec_hi
+// GFX12: v_cvt_f16_f32_e32 v5, exec_hi ; encoding: [0x7f,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, null
+// GFX12: v_cvt_f16_f32_e32 v5, null ; encoding: [0x7c,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, -1
+// GFX12: v_cvt_f16_f32_e32 v5, -1 ; encoding: [0xc1,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, 0.5
+// GFX12: v_cvt_f16_f32_e32 v5, 0.5 ; encoding: [0xf0,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v5, src_scc
+// GFX12: v_cvt_f16_f32_e32 v5, src_scc ; encoding: [0xfd,0x14,0x0a,0x7e]
+
+v_cvt_f16_f32 v127, 0xaf123456
+// GFX12: v_cvt_f16_f32_e32 v127, 0xaf123456 ; encoding: [0xff,0x14,0xfe,0x7e,0x56,0x34,0x12,0xaf]
+
+v_cvt_f16_i16 v5, v1
+// GFX12: v_cvt_f16_i16_e32 v5, v1 ; encoding: [0x01,0xa3,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, v127
+// GFX12: v_cvt_f16_i16_e32 v5, v127 ; encoding: [0x7f,0xa3,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, s1
+// GFX12: v_cvt_f16_i16_e32 v5, s1 ; encoding: [0x01,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, s105
+// GFX12: v_cvt_f16_i16_e32 v5, s105 ; encoding: [0x69,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, vcc_lo
+// GFX12: v_cvt_f16_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, vcc_hi
+// GFX12: v_cvt_f16_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, ttmp15
+// GFX12: v_cvt_f16_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, m0
+// GFX12: v_cvt_f16_i16_e32 v5, m0 ; encoding: [0x7d,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, exec_lo
+// GFX12: v_cvt_f16_i16_e32 v5, exec_lo ; encoding: [0x7e,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, exec_hi
+// GFX12: v_cvt_f16_i16_e32 v5, exec_hi ; encoding: [0x7f,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, null
+// GFX12: v_cvt_f16_i16_e32 v5, null ; encoding: [0x7c,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, -1
+// GFX12: v_cvt_f16_i16_e32 v5, -1 ; encoding: [0xc1,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, 0.5
+// GFX12-ASM: v_cvt_f16_i16_e32 v5, 0.5 ; encoding: [0xf0,0xa2,0x0a,0x7e]
+// GFX12-DIS: v_cvt_f16_i16_e32 v5, 0x3800 ; encoding: [0xff,0xa2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_f16_i16 v5, src_scc
+// GFX12: v_cvt_f16_i16_e32 v5, src_scc ; encoding: [0xfd,0xa2,0x0a,0x7e]
+
+v_cvt_f16_i16 v127, 0xfe0b
+// GFX12: v_cvt_f16_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xa2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f16_u16 v5, v1
+// GFX12: v_cvt_f16_u16_e32 v5, v1 ; encoding: [0x01,0xa1,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, v127
+// GFX12: v_cvt_f16_u16_e32 v5, v127 ; encoding: [0x7f,0xa1,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, s1
+// GFX12: v_cvt_f16_u16_e32 v5, s1 ; encoding: [0x01,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, s105
+// GFX12: v_cvt_f16_u16_e32 v5, s105 ; encoding: [0x69,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, vcc_lo
+// GFX12: v_cvt_f16_u16_e32 v5, vcc_lo ; encoding: [0x6a,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, vcc_hi
+// GFX12: v_cvt_f16_u16_e32 v5, vcc_hi ; encoding: [0x6b,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, ttmp15
+// GFX12: v_cvt_f16_u16_e32 v5, ttmp15 ; encoding: [0x7b,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, m0
+// GFX12: v_cvt_f16_u16_e32 v5, m0 ; encoding: [0x7d,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, exec_lo
+// GFX12: v_cvt_f16_u16_e32 v5, exec_lo ; encoding: [0x7e,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, exec_hi
+// GFX12: v_cvt_f16_u16_e32 v5, exec_hi ; encoding: [0x7f,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, null
+// GFX12: v_cvt_f16_u16_e32 v5, null ; encoding: [0x7c,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, -1
+// GFX12: v_cvt_f16_u16_e32 v5, -1 ; encoding: [0xc1,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, 0.5
+// GFX12-ASM: v_cvt_f16_u16_e32 v5, 0.5 ; encoding: [0xf0,0xa0,0x0a,0x7e]
+// GFX12-DIS: v_cvt_f16_u16_e32 v5, 0x3800 ; encoding: [0xff,0xa0,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_f16_u16 v5, src_scc
+// GFX12: v_cvt_f16_u16_e32 v5, src_scc ; encoding: [0xfd,0xa0,0x0a,0x7e]
+
+v_cvt_f16_u16 v127, 0xfe0b
+// GFX12: v_cvt_f16_u16_e32 v127, 0xfe0b ; encoding: [0xff,0xa0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f32_f16 v5, v1
+// GFX12: v_cvt_f32_f16_e32 v5, v1 ; encoding: [0x01,0x17,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, v127
+// GFX12: v_cvt_f32_f16_e32 v5, v127 ; encoding: [0x7f,0x17,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, s1
+// GFX12: v_cvt_f32_f16_e32 v5, s1 ; encoding: [0x01,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, s105
+// GFX12: v_cvt_f32_f16_e32 v5, s105 ; encoding: [0x69,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, vcc_lo
+// GFX12: v_cvt_f32_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, vcc_hi
+// GFX12: v_cvt_f32_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, ttmp15
+// GFX12: v_cvt_f32_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, m0
+// GFX12: v_cvt_f32_f16_e32 v5, m0 ; encoding: [0x7d,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, exec_lo
+// GFX12: v_cvt_f32_f16_e32 v5, exec_lo ; encoding: [0x7e,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, exec_hi
+// GFX12: v_cvt_f32_f16_e32 v5, exec_hi ; encoding: [0x7f,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, null
+// GFX12: v_cvt_f32_f16_e32 v5, null ; encoding: [0x7c,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, -1
+// GFX12: v_cvt_f32_f16_e32 v5, -1 ; encoding: [0xc1,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, 0.5
+// GFX12: v_cvt_f32_f16_e32 v5, 0.5 ; encoding: [0xf0,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v5, src_scc
+// GFX12: v_cvt_f32_f16_e32 v5, src_scc ; encoding: [0xfd,0x16,0x0a,0x7e]
+
+v_cvt_f32_f16 v255, 0xfe0b
+// GFX12: v_cvt_f32_f16_e32 v255, 0xfe0b ; encoding: [0xff,0x16,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f32_f64 v5, v[1:2]
+// GFX12: v_cvt_f32_f64_e32 v5, v[1:2] ; encoding: [0x01,0x1f,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, v[254:255]
+// GFX12: v_cvt_f32_f64_e32 v5, v[254:255] ; encoding: [0xfe,0x1f,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, s[2:3]
+// GFX12: v_cvt_f32_f64_e32 v5, s[2:3] ; encoding: [0x02,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, s[104:105]
+// GFX12: v_cvt_f32_f64_e32 v5, s[104:105] ; encoding: [0x68,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, vcc
+// GFX12: v_cvt_f32_f64_e32 v5, vcc ; encoding: [0x6a,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, ttmp[14:15]
+// GFX12: v_cvt_f32_f64_e32 v5, ttmp[14:15] ; encoding: [0x7a,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, exec
+// GFX12: v_cvt_f32_f64_e32 v5, exec ; encoding: [0x7e,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, null
+// GFX12: v_cvt_f32_f64_e32 v5, null ; encoding: [0x7c,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, -1
+// GFX12: v_cvt_f32_f64_e32 v5, -1 ; encoding: [0xc1,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, 0.5
+// GFX12: v_cvt_f32_f64_e32 v5, 0.5 ; encoding: [0xf0,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v5, src_scc
+// GFX12: v_cvt_f32_f64_e32 v5, src_scc ; encoding: [0xfd,0x1e,0x0a,0x7e]
+
+v_cvt_f32_f64 v255, 0xaf123456
+// GFX12: v_cvt_f32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x1e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_i32 v5, v1
+// GFX12: v_cvt_f32_i32_e32 v5, v1 ; encoding: [0x01,0x0b,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, v255
+// GFX12: v_cvt_f32_i32_e32 v5, v255 ; encoding: [0xff,0x0b,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, s1
+// GFX12: v_cvt_f32_i32_e32 v5, s1 ; encoding: [0x01,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, s105
+// GFX12: v_cvt_f32_i32_e32 v5, s105 ; encoding: [0x69,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, vcc_lo
+// GFX12: v_cvt_f32_i32_e32 v5, vcc_lo ; encoding: [0x6a,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, vcc_hi
+// GFX12: v_cvt_f32_i32_e32 v5, vcc_hi ; encoding: [0x6b,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, ttmp15
+// GFX12: v_cvt_f32_i32_e32 v5, ttmp15 ; encoding: [0x7b,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, m0
+// GFX12: v_cvt_f32_i32_e32 v5, m0 ; encoding: [0x7d,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, exec_lo
+// GFX12: v_cvt_f32_i32_e32 v5, exec_lo ; encoding: [0x7e,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, exec_hi
+// GFX12: v_cvt_f32_i32_e32 v5, exec_hi ; encoding: [0x7f,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, null
+// GFX12: v_cvt_f32_i32_e32 v5, null ; encoding: [0x7c,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, -1
+// GFX12: v_cvt_f32_i32_e32 v5, -1 ; encoding: [0xc1,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, 0.5
+// GFX12: v_cvt_f32_i32_e32 v5, 0.5 ; encoding: [0xf0,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v5, src_scc
+// GFX12: v_cvt_f32_i32_e32 v5, src_scc ; encoding: [0xfd,0x0a,0x0a,0x7e]
+
+v_cvt_f32_i32 v255, 0xaf123456
+// GFX12: v_cvt_f32_i32_e32 v255, 0xaf123456 ; encoding: [0xff,0x0a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_u32 v5, v1
+// GFX12: v_cvt_f32_u32_e32 v5, v1 ; encoding: [0x01,0x0d,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, v255
+// GFX12: v_cvt_f32_u32_e32 v5, v255 ; encoding: [0xff,0x0d,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, s1
+// GFX12: v_cvt_f32_u32_e32 v5, s1 ; encoding: [0x01,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, s105
+// GFX12: v_cvt_f32_u32_e32 v5, s105 ; encoding: [0x69,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, vcc_lo
+// GFX12: v_cvt_f32_u32_e32 v5, vcc_lo ; encoding: [0x6a,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, vcc_hi
+// GFX12: v_cvt_f32_u32_e32 v5, vcc_hi ; encoding: [0x6b,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, ttmp15
+// GFX12: v_cvt_f32_u32_e32 v5, ttmp15 ; encoding: [0x7b,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, m0
+// GFX12: v_cvt_f32_u32_e32 v5, m0 ; encoding: [0x7d,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, exec_lo
+// GFX12: v_cvt_f32_u32_e32 v5, exec_lo ; encoding: [0x7e,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, exec_hi
+// GFX12: v_cvt_f32_u32_e32 v5, exec_hi ; encoding: [0x7f,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, null
+// GFX12: v_cvt_f32_u32_e32 v5, null ; encoding: [0x7c,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, -1
+// GFX12: v_cvt_f32_u32_e32 v5, -1 ; encoding: [0xc1,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, 0.5
+// GFX12: v_cvt_f32_u32_e32 v5, 0.5 ; encoding: [0xf0,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v5, src_scc
+// GFX12: v_cvt_f32_u32_e32 v5, src_scc ; encoding: [0xfd,0x0c,0x0a,0x7e]
+
+v_cvt_f32_u32 v255, 0xaf123456
+// GFX12: v_cvt_f32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x0c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte0 v5, v1
+// GFX12: v_cvt_f32_ubyte0_e32 v5, v1 ; encoding: [0x01,0x23,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, v255
+// GFX12: v_cvt_f32_ubyte0_e32 v5, v255 ; encoding: [0xff,0x23,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, s1
+// GFX12: v_cvt_f32_ubyte0_e32 v5, s1 ; encoding: [0x01,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, s105
+// GFX12: v_cvt_f32_ubyte0_e32 v5, s105 ; encoding: [0x69,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_lo ; encoding: [0x6a,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte0_e32 v5, vcc_hi ; encoding: [0x6b,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte0_e32 v5, ttmp15 ; encoding: [0x7b,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, m0
+// GFX12: v_cvt_f32_ubyte0_e32 v5, m0 ; encoding: [0x7d,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte0_e32 v5, exec_lo ; encoding: [0x7e,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte0_e32 v5, exec_hi ; encoding: [0x7f,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, null
+// GFX12: v_cvt_f32_ubyte0_e32 v5, null ; encoding: [0x7c,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, -1
+// GFX12: v_cvt_f32_ubyte0_e32 v5, -1 ; encoding: [0xc1,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, 0.5
+// GFX12: v_cvt_f32_ubyte0_e32 v5, 0.5 ; encoding: [0xf0,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v5, src_scc
+// GFX12: v_cvt_f32_ubyte0_e32 v5, src_scc ; encoding: [0xfd,0x22,0x0a,0x7e]
+
+v_cvt_f32_ubyte0 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte0_e32 v255, 0xaf123456 ; encoding: [0xff,0x22,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte1 v5, v1
+// GFX12: v_cvt_f32_ubyte1_e32 v5, v1 ; encoding: [0x01,0x25,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, v255
+// GFX12: v_cvt_f32_ubyte1_e32 v5, v255 ; encoding: [0xff,0x25,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, s1
+// GFX12: v_cvt_f32_ubyte1_e32 v5, s1 ; encoding: [0x01,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, s105
+// GFX12: v_cvt_f32_ubyte1_e32 v5, s105 ; encoding: [0x69,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_lo ; encoding: [0x6a,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte1_e32 v5, vcc_hi ; encoding: [0x6b,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte1_e32 v5, ttmp15 ; encoding: [0x7b,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, m0
+// GFX12: v_cvt_f32_ubyte1_e32 v5, m0 ; encoding: [0x7d,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte1_e32 v5, exec_lo ; encoding: [0x7e,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte1_e32 v5, exec_hi ; encoding: [0x7f,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, null
+// GFX12: v_cvt_f32_ubyte1_e32 v5, null ; encoding: [0x7c,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, -1
+// GFX12: v_cvt_f32_ubyte1_e32 v5, -1 ; encoding: [0xc1,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, 0.5
+// GFX12: v_cvt_f32_ubyte1_e32 v5, 0.5 ; encoding: [0xf0,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v5, src_scc
+// GFX12: v_cvt_f32_ubyte1_e32 v5, src_scc ; encoding: [0xfd,0x24,0x0a,0x7e]
+
+v_cvt_f32_ubyte1 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte1_e32 v255, 0xaf123456 ; encoding: [0xff,0x24,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte2 v5, v1
+// GFX12: v_cvt_f32_ubyte2_e32 v5, v1 ; encoding: [0x01,0x27,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, v255
+// GFX12: v_cvt_f32_ubyte2_e32 v5, v255 ; encoding: [0xff,0x27,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, s1
+// GFX12: v_cvt_f32_ubyte2_e32 v5, s1 ; encoding: [0x01,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, s105
+// GFX12: v_cvt_f32_ubyte2_e32 v5, s105 ; encoding: [0x69,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_lo ; encoding: [0x6a,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte2_e32 v5, vcc_hi ; encoding: [0x6b,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte2_e32 v5, ttmp15 ; encoding: [0x7b,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, m0
+// GFX12: v_cvt_f32_ubyte2_e32 v5, m0 ; encoding: [0x7d,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte2_e32 v5, exec_lo ; encoding: [0x7e,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte2_e32 v5, exec_hi ; encoding: [0x7f,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, null
+// GFX12: v_cvt_f32_ubyte2_e32 v5, null ; encoding: [0x7c,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, -1
+// GFX12: v_cvt_f32_ubyte2_e32 v5, -1 ; encoding: [0xc1,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, 0.5
+// GFX12: v_cvt_f32_ubyte2_e32 v5, 0.5 ; encoding: [0xf0,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v5, src_scc
+// GFX12: v_cvt_f32_ubyte2_e32 v5, src_scc ; encoding: [0xfd,0x26,0x0a,0x7e]
+
+v_cvt_f32_ubyte2 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte2_e32 v255, 0xaf123456 ; encoding: [0xff,0x26,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f32_ubyte3 v5, v1
+// GFX12: v_cvt_f32_ubyte3_e32 v5, v1 ; encoding: [0x01,0x29,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, v255
+// GFX12: v_cvt_f32_ubyte3_e32 v5, v255 ; encoding: [0xff,0x29,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, s1
+// GFX12: v_cvt_f32_ubyte3_e32 v5, s1 ; encoding: [0x01,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, s105
+// GFX12: v_cvt_f32_ubyte3_e32 v5, s105 ; encoding: [0x69,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, vcc_lo
+// GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_lo ; encoding: [0x6a,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, vcc_hi
+// GFX12: v_cvt_f32_ubyte3_e32 v5, vcc_hi ; encoding: [0x6b,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, ttmp15
+// GFX12: v_cvt_f32_ubyte3_e32 v5, ttmp15 ; encoding: [0x7b,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, m0
+// GFX12: v_cvt_f32_ubyte3_e32 v5, m0 ; encoding: [0x7d,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, exec_lo
+// GFX12: v_cvt_f32_ubyte3_e32 v5, exec_lo ; encoding: [0x7e,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, exec_hi
+// GFX12: v_cvt_f32_ubyte3_e32 v5, exec_hi ; encoding: [0x7f,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, null
+// GFX12: v_cvt_f32_ubyte3_e32 v5, null ; encoding: [0x7c,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, -1
+// GFX12: v_cvt_f32_ubyte3_e32 v5, -1 ; encoding: [0xc1,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, 0.5
+// GFX12: v_cvt_f32_ubyte3_e32 v5, 0.5 ; encoding: [0xf0,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v5, src_scc
+// GFX12: v_cvt_f32_ubyte3_e32 v5, src_scc ; encoding: [0xfd,0x28,0x0a,0x7e]
+
+v_cvt_f32_ubyte3 v255, 0xaf123456
+// GFX12: v_cvt_f32_ubyte3_e32 v255, 0xaf123456 ; encoding: [0xff,0x28,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f64_f32 v[5:6], v1
+// GFX12: v_cvt_f64_f32_e32 v[5:6], v1 ; encoding: [0x01,0x21,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], v255
+// GFX12: v_cvt_f64_f32_e32 v[5:6], v255 ; encoding: [0xff,0x21,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], s1
+// GFX12: v_cvt_f64_f32_e32 v[5:6], s1 ; encoding: [0x01,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], s105
+// GFX12: v_cvt_f64_f32_e32 v[5:6], s105 ; encoding: [0x69,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], vcc_lo
+// GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_lo ; encoding: [0x6a,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], vcc_hi
+// GFX12: v_cvt_f64_f32_e32 v[5:6], vcc_hi ; encoding: [0x6b,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], ttmp15
+// GFX12: v_cvt_f64_f32_e32 v[5:6], ttmp15 ; encoding: [0x7b,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], m0
+// GFX12: v_cvt_f64_f32_e32 v[5:6], m0 ; encoding: [0x7d,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], exec_lo
+// GFX12: v_cvt_f64_f32_e32 v[5:6], exec_lo ; encoding: [0x7e,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], exec_hi
+// GFX12: v_cvt_f64_f32_e32 v[5:6], exec_hi ; encoding: [0x7f,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], null
+// GFX12: v_cvt_f64_f32_e32 v[5:6], null ; encoding: [0x7c,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], -1
+// GFX12: v_cvt_f64_f32_e32 v[5:6], -1 ; encoding: [0xc1,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], 0.5
+// GFX12: v_cvt_f64_f32_e32 v[5:6], 0.5 ; encoding: [0xf0,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[5:6], src_scc
+// GFX12: v_cvt_f64_f32_e32 v[5:6], src_scc ; encoding: [0xfd,0x20,0x0a,0x7e]
+
+v_cvt_f64_f32 v[254:255], 0xaf123456
+// GFX12: v_cvt_f64_f32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x20,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f64_i32 v[5:6], v1
+// GFX12: v_cvt_f64_i32_e32 v[5:6], v1 ; encoding: [0x01,0x09,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], v255
+// GFX12: v_cvt_f64_i32_e32 v[5:6], v255 ; encoding: [0xff,0x09,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], s1
+// GFX12: v_cvt_f64_i32_e32 v[5:6], s1 ; encoding: [0x01,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], s105
+// GFX12: v_cvt_f64_i32_e32 v[5:6], s105 ; encoding: [0x69,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], vcc_lo
+// GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_lo ; encoding: [0x6a,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], vcc_hi
+// GFX12: v_cvt_f64_i32_e32 v[5:6], vcc_hi ; encoding: [0x6b,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], ttmp15
+// GFX12: v_cvt_f64_i32_e32 v[5:6], ttmp15 ; encoding: [0x7b,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], m0
+// GFX12: v_cvt_f64_i32_e32 v[5:6], m0 ; encoding: [0x7d,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], exec_lo
+// GFX12: v_cvt_f64_i32_e32 v[5:6], exec_lo ; encoding: [0x7e,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], exec_hi
+// GFX12: v_cvt_f64_i32_e32 v[5:6], exec_hi ; encoding: [0x7f,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], null
+// GFX12: v_cvt_f64_i32_e32 v[5:6], null ; encoding: [0x7c,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], -1
+// GFX12: v_cvt_f64_i32_e32 v[5:6], -1 ; encoding: [0xc1,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], 0.5
+// GFX12: v_cvt_f64_i32_e32 v[5:6], 0.5 ; encoding: [0xf0,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[5:6], src_scc
+// GFX12: v_cvt_f64_i32_e32 v[5:6], src_scc ; encoding: [0xfd,0x08,0x0a,0x7e]
+
+v_cvt_f64_i32 v[254:255], 0xaf123456
+// GFX12: v_cvt_f64_i32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x08,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_f64_u32 v[5:6], v1
+// GFX12: v_cvt_f64_u32_e32 v[5:6], v1 ; encoding: [0x01,0x2d,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], v255
+// GFX12: v_cvt_f64_u32_e32 v[5:6], v255 ; encoding: [0xff,0x2d,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], s1
+// GFX12: v_cvt_f64_u32_e32 v[5:6], s1 ; encoding: [0x01,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], s105
+// GFX12: v_cvt_f64_u32_e32 v[5:6], s105 ; encoding: [0x69,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], vcc_lo
+// GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_lo ; encoding: [0x6a,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], vcc_hi
+// GFX12: v_cvt_f64_u32_e32 v[5:6], vcc_hi ; encoding: [0x6b,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], ttmp15
+// GFX12: v_cvt_f64_u32_e32 v[5:6], ttmp15 ; encoding: [0x7b,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], m0
+// GFX12: v_cvt_f64_u32_e32 v[5:6], m0 ; encoding: [0x7d,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], exec_lo
+// GFX12: v_cvt_f64_u32_e32 v[5:6], exec_lo ; encoding: [0x7e,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], exec_hi
+// GFX12: v_cvt_f64_u32_e32 v[5:6], exec_hi ; encoding: [0x7f,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], null
+// GFX12: v_cvt_f64_u32_e32 v[5:6], null ; encoding: [0x7c,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], -1
+// GFX12: v_cvt_f64_u32_e32 v[5:6], -1 ; encoding: [0xc1,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], 0.5
+// GFX12: v_cvt_f64_u32_e32 v[5:6], 0.5 ; encoding: [0xf0,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[5:6], src_scc
+// GFX12: v_cvt_f64_u32_e32 v[5:6], src_scc ; encoding: [0xfd,0x2c,0x0a,0x7e]
+
+v_cvt_f64_u32 v[254:255], 0xaf123456
+// GFX12: v_cvt_f64_u32_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x2c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_floor_i32_f32 v5, v1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v1 ; encoding: [0x01,0x1b,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, v255
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v255 ; encoding: [0xff,0x1b,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, s1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s1 ; encoding: [0x01,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, s105
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s105 ; encoding: [0x69,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, ttmp15
+// GFX12: v_cvt_floor_i32_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, m0
+// GFX12: v_cvt_floor_i32_f32_e32 v5, m0 ; encoding: [0x7d,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, exec_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_lo ; encoding: [0x7e,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, exec_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_hi ; encoding: [0x7f,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, null
+// GFX12: v_cvt_floor_i32_f32_e32 v5, null ; encoding: [0x7c,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, -1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, -1 ; encoding: [0xc1,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, 0.5
+// GFX12: v_cvt_floor_i32_f32_e32 v5, 0.5 ; encoding: [0xf0,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v5, src_scc
+// GFX12: v_cvt_floor_i32_f32_e32 v5, src_scc ; encoding: [0xfd,0x1a,0x0a,0x7e]
+
+v_cvt_floor_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_floor_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_flr_i32_f32 v5, v1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v1 ; encoding: [0x01,0x1b,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, v255
+// GFX12: v_cvt_floor_i32_f32_e32 v5, v255 ; encoding: [0xff,0x1b,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, s1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s1 ; encoding: [0x01,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, s105
+// GFX12: v_cvt_floor_i32_f32_e32 v5, s105 ; encoding: [0x69,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, ttmp15
+// GFX12: v_cvt_floor_i32_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, m0
+// GFX12: v_cvt_floor_i32_f32_e32 v5, m0 ; encoding: [0x7d,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, exec_lo
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_lo ; encoding: [0x7e,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, exec_hi
+// GFX12: v_cvt_floor_i32_f32_e32 v5, exec_hi ; encoding: [0x7f,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, null
+// GFX12: v_cvt_floor_i32_f32_e32 v5, null ; encoding: [0x7c,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, -1
+// GFX12: v_cvt_floor_i32_f32_e32 v5, -1 ; encoding: [0xc1,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, 0.5
+// GFX12: v_cvt_floor_i32_f32_e32 v5, 0.5 ; encoding: [0xf0,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v5, src_scc
+// GFX12: v_cvt_floor_i32_f32_e32 v5, src_scc ; encoding: [0xfd,0x1a,0x0a,0x7e]
+
+v_cvt_flr_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_floor_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x1a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_i16_f16 v5, v1
+// GFX12: v_cvt_i16_f16_e32 v5, v1 ; encoding: [0x01,0xa7,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, v127
+// GFX12: v_cvt_i16_f16_e32 v5, v127 ; encoding: [0x7f,0xa7,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, s1
+// GFX12: v_cvt_i16_f16_e32 v5, s1 ; encoding: [0x01,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, s105
+// GFX12: v_cvt_i16_f16_e32 v5, s105 ; encoding: [0x69,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, vcc_lo
+// GFX12: v_cvt_i16_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, vcc_hi
+// GFX12: v_cvt_i16_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, ttmp15
+// GFX12: v_cvt_i16_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, m0
+// GFX12: v_cvt_i16_f16_e32 v5, m0 ; encoding: [0x7d,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, exec_lo
+// GFX12: v_cvt_i16_f16_e32 v5, exec_lo ; encoding: [0x7e,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, exec_hi
+// GFX12: v_cvt_i16_f16_e32 v5, exec_hi ; encoding: [0x7f,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, null
+// GFX12: v_cvt_i16_f16_e32 v5, null ; encoding: [0x7c,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, -1
+// GFX12: v_cvt_i16_f16_e32 v5, -1 ; encoding: [0xc1,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, 0.5
+// GFX12: v_cvt_i16_f16_e32 v5, 0.5 ; encoding: [0xf0,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, src_scc
+// GFX12: v_cvt_i16_f16_e32 v5, src_scc ; encoding: [0xfd,0xa6,0x0a,0x7e]
+
+v_cvt_i16_f16 v127, 0xfe0b
+// GFX12: v_cvt_i16_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xa6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_i32_f32 v5, v1
+// GFX12: v_cvt_i32_f32_e32 v5, v1 ; encoding: [0x01,0x11,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, v255
+// GFX12: v_cvt_i32_f32_e32 v5, v255 ; encoding: [0xff,0x11,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, s1
+// GFX12: v_cvt_i32_f32_e32 v5, s1 ; encoding: [0x01,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, s105
+// GFX12: v_cvt_i32_f32_e32 v5, s105 ; encoding: [0x69,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_i32_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_i32_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, ttmp15
+// GFX12: v_cvt_i32_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, m0
+// GFX12: v_cvt_i32_f32_e32 v5, m0 ; encoding: [0x7d,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, exec_lo
+// GFX12: v_cvt_i32_f32_e32 v5, exec_lo ; encoding: [0x7e,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, exec_hi
+// GFX12: v_cvt_i32_f32_e32 v5, exec_hi ; encoding: [0x7f,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, null
+// GFX12: v_cvt_i32_f32_e32 v5, null ; encoding: [0x7c,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, -1
+// GFX12: v_cvt_i32_f32_e32 v5, -1 ; encoding: [0xc1,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, 0.5
+// GFX12: v_cvt_i32_f32_e32 v5, 0.5 ; encoding: [0xf0,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v5, src_scc
+// GFX12: v_cvt_i32_f32_e32 v5, src_scc ; encoding: [0xfd,0x10,0x0a,0x7e]
+
+v_cvt_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x10,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_i32_f64 v5, v[1:2]
+// GFX12: v_cvt_i32_f64_e32 v5, v[1:2] ; encoding: [0x01,0x07,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, v[254:255]
+// GFX12: v_cvt_i32_f64_e32 v5, v[254:255] ; encoding: [0xfe,0x07,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, s[2:3]
+// GFX12: v_cvt_i32_f64_e32 v5, s[2:3] ; encoding: [0x02,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, s[104:105]
+// GFX12: v_cvt_i32_f64_e32 v5, s[104:105] ; encoding: [0x68,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, vcc
+// GFX12: v_cvt_i32_f64_e32 v5, vcc ; encoding: [0x6a,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, ttmp[14:15]
+// GFX12: v_cvt_i32_f64_e32 v5, ttmp[14:15] ; encoding: [0x7a,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, exec
+// GFX12: v_cvt_i32_f64_e32 v5, exec ; encoding: [0x7e,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, null
+// GFX12: v_cvt_i32_f64_e32 v5, null ; encoding: [0x7c,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, -1
+// GFX12: v_cvt_i32_f64_e32 v5, -1 ; encoding: [0xc1,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, 0.5
+// GFX12: v_cvt_i32_f64_e32 v5, 0.5 ; encoding: [0xf0,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v5, src_scc
+// GFX12: v_cvt_i32_f64_e32 v5, src_scc ; encoding: [0xfd,0x06,0x0a,0x7e]
+
+v_cvt_i32_f64 v255, 0xaf123456
+// GFX12: v_cvt_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x06,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_i32_i16 v5, v1
+// GFX12: v_cvt_i32_i16_e32 v5, v1 ; encoding: [0x01,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, v127
+// GFX12: v_cvt_i32_i16_e32 v5, v127 ; encoding: [0x7f,0xd5,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, s1
+// GFX12: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, s105
+// GFX12: v_cvt_i32_i16_e32 v5, s105 ; encoding: [0x69,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, vcc_lo
+// GFX12: v_cvt_i32_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, vcc_hi
+// GFX12: v_cvt_i32_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, ttmp15
+// GFX12: v_cvt_i32_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, m0
+// GFX12: v_cvt_i32_i16_e32 v5, m0 ; encoding: [0x7d,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, exec_lo
+// GFX12: v_cvt_i32_i16_e32 v5, exec_lo ; encoding: [0x7e,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, exec_hi
+// GFX12: v_cvt_i32_i16_e32 v5, exec_hi ; encoding: [0x7f,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, null
+// GFX12: v_cvt_i32_i16_e32 v5, null ; encoding: [0x7c,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, -1
+// GFX12: v_cvt_i32_i16_e32 v5, -1 ; encoding: [0xc1,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v5, 0.5
+// GFX12-ASM: v_cvt_i32_i16_e32 v5, 0.5 ; encoding: [0xf0,0xd4,0x0a,0x7e]
+// GFX12-DIS: v_cvt_i32_i16_e32 v5, 0x3800 ; encoding: [0xff,0xd4,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_i32_i16 v5, src_scc
+// GFX12: v_cvt_i32_i16_e32 v5, src_scc ; encoding: [0xfd,0xd4,0x0a,0x7e]
+
+v_cvt_i32_i16 v255, 0xfe0b
+// GFX12: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+
+v_cvt_nearest_i32_f32 v5, v1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, v255
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v255 ; encoding: [0xff,0x19,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, s1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s1 ; encoding: [0x01,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, s105
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s105 ; encoding: [0x69,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, ttmp15
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, m0
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, m0 ; encoding: [0x7d,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, exec_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_lo ; encoding: [0x7e,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, exec_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_hi ; encoding: [0x7f,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, null
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, null ; encoding: [0x7c,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, -1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, -1 ; encoding: [0xc1,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, 0.5
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, 0.5 ; encoding: [0xf0,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v5, src_scc
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, src_scc ; encoding: [0xfd,0x18,0x0a,0x7e]
+
+v_cvt_nearest_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_nearest_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_norm_i16_f16 v5, v1
+// GFX12: v_cvt_norm_i16_f16_e32 v5, v1 ; encoding: [0x01,0xc7,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, v127
+// GFX12: v_cvt_norm_i16_f16_e32 v5, v127 ; encoding: [0x7f,0xc7,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, s1
+// GFX12: v_cvt_norm_i16_f16_e32 v5, s1 ; encoding: [0x01,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, s105
+// GFX12: v_cvt_norm_i16_f16_e32 v5, s105 ; encoding: [0x69,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, vcc_lo
+// GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, vcc_hi
+// GFX12: v_cvt_norm_i16_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, ttmp15
+// GFX12: v_cvt_norm_i16_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, m0
+// GFX12: v_cvt_norm_i16_f16_e32 v5, m0 ; encoding: [0x7d,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, exec_lo
+// GFX12: v_cvt_norm_i16_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, exec_hi
+// GFX12: v_cvt_norm_i16_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, null
+// GFX12: v_cvt_norm_i16_f16_e32 v5, null ; encoding: [0x7c,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, -1
+// GFX12: v_cvt_norm_i16_f16_e32 v5, -1 ; encoding: [0xc1,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, 0.5
+// GFX12: v_cvt_norm_i16_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v5, src_scc
+// GFX12: v_cvt_norm_i16_f16_e32 v5, src_scc ; encoding: [0xfd,0xc6,0x0a,0x7e]
+
+v_cvt_norm_i16_f16 v127, 0xfe0b
+// GFX12: v_cvt_norm_i16_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_norm_u16_f16 v5, v1
+// GFX12: v_cvt_norm_u16_f16_e32 v5, v1 ; encoding: [0x01,0xc9,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, v127
+// GFX12: v_cvt_norm_u16_f16_e32 v5, v127 ; encoding: [0x7f,0xc9,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, s1
+// GFX12: v_cvt_norm_u16_f16_e32 v5, s1 ; encoding: [0x01,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, s105
+// GFX12: v_cvt_norm_u16_f16_e32 v5, s105 ; encoding: [0x69,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, vcc_lo
+// GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, vcc_hi
+// GFX12: v_cvt_norm_u16_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, ttmp15
+// GFX12: v_cvt_norm_u16_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, m0
+// GFX12: v_cvt_norm_u16_f16_e32 v5, m0 ; encoding: [0x7d,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, exec_lo
+// GFX12: v_cvt_norm_u16_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, exec_hi
+// GFX12: v_cvt_norm_u16_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, null
+// GFX12: v_cvt_norm_u16_f16_e32 v5, null ; encoding: [0x7c,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, -1
+// GFX12: v_cvt_norm_u16_f16_e32 v5, -1 ; encoding: [0xc1,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, 0.5
+// GFX12: v_cvt_norm_u16_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v5, src_scc
+// GFX12: v_cvt_norm_u16_f16_e32 v5, src_scc ; encoding: [0xfd,0xc8,0x0a,0x7e]
+
+v_cvt_norm_u16_f16 v127, 0xfe0b
+// GFX12: v_cvt_norm_u16_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_off_f32_i4 v5, v1
+// GFX12: v_cvt_off_f32_i4_e32 v5, v1 ; encoding: [0x01,0x1d,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, v255
+// GFX12: v_cvt_off_f32_i4_e32 v5, v255 ; encoding: [0xff,0x1d,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, s1
+// GFX12: v_cvt_off_f32_i4_e32 v5, s1 ; encoding: [0x01,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, s105
+// GFX12: v_cvt_off_f32_i4_e32 v5, s105 ; encoding: [0x69,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, vcc_lo
+// GFX12: v_cvt_off_f32_i4_e32 v5, vcc_lo ; encoding: [0x6a,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, vcc_hi
+// GFX12: v_cvt_off_f32_i4_e32 v5, vcc_hi ; encoding: [0x6b,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, ttmp15
+// GFX12: v_cvt_off_f32_i4_e32 v5, ttmp15 ; encoding: [0x7b,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, m0
+// GFX12: v_cvt_off_f32_i4_e32 v5, m0 ; encoding: [0x7d,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, exec_lo
+// GFX12: v_cvt_off_f32_i4_e32 v5, exec_lo ; encoding: [0x7e,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, exec_hi
+// GFX12: v_cvt_off_f32_i4_e32 v5, exec_hi ; encoding: [0x7f,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, null
+// GFX12: v_cvt_off_f32_i4_e32 v5, null ; encoding: [0x7c,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, -1
+// GFX12: v_cvt_off_f32_i4_e32 v5, -1 ; encoding: [0xc1,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, 0.5
+// GFX12: v_cvt_off_f32_i4_e32 v5, 0.5 ; encoding: [0xf0,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v5, src_scc
+// GFX12: v_cvt_off_f32_i4_e32 v5, src_scc ; encoding: [0xfd,0x1c,0x0a,0x7e]
+
+v_cvt_off_f32_i4 v255, 0x4f
+// GFX12: v_cvt_off_f32_i4_e32 v255, 0x4f ; encoding: [0xff,0x1c,0xfe,0x7f,0x4f,0x00,0x00,0x00]
+
+v_cvt_rpi_i32_f32 v5, v1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, v255
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, v255 ; encoding: [0xff,0x19,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, s1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s1 ; encoding: [0x01,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, s105
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, s105 ; encoding: [0x69,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, vcc_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, vcc_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, ttmp15
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, m0
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, m0 ; encoding: [0x7d,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, exec_lo
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_lo ; encoding: [0x7e,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, exec_hi
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, exec_hi ; encoding: [0x7f,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, null
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, null ; encoding: [0x7c,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, -1
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, -1 ; encoding: [0xc1,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, 0.5
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, 0.5 ; encoding: [0xf0,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v5, src_scc
+// GFX12: v_cvt_nearest_i32_f32_e32 v5, src_scc ; encoding: [0xfd,0x18,0x0a,0x7e]
+
+v_cvt_rpi_i32_f32 v255, 0xaf123456
+// GFX12: v_cvt_nearest_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x18,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_u16_f16 v5, v1
+// GFX12: v_cvt_u16_f16_e32 v5, v1 ; encoding: [0x01,0xa5,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, v127
+// GFX12: v_cvt_u16_f16_e32 v5, v127 ; encoding: [0x7f,0xa5,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, s1
+// GFX12: v_cvt_u16_f16_e32 v5, s1 ; encoding: [0x01,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, s105
+// GFX12: v_cvt_u16_f16_e32 v5, s105 ; encoding: [0x69,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, vcc_lo
+// GFX12: v_cvt_u16_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, vcc_hi
+// GFX12: v_cvt_u16_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, ttmp15
+// GFX12: v_cvt_u16_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, m0
+// GFX12: v_cvt_u16_f16_e32 v5, m0 ; encoding: [0x7d,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, exec_lo
+// GFX12: v_cvt_u16_f16_e32 v5, exec_lo ; encoding: [0x7e,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, exec_hi
+// GFX12: v_cvt_u16_f16_e32 v5, exec_hi ; encoding: [0x7f,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, null
+// GFX12: v_cvt_u16_f16_e32 v5, null ; encoding: [0x7c,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, -1
+// GFX12: v_cvt_u16_f16_e32 v5, -1 ; encoding: [0xc1,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, 0.5
+// GFX12: v_cvt_u16_f16_e32 v5, 0.5 ; encoding: [0xf0,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, src_scc
+// GFX12: v_cvt_u16_f16_e32 v5, src_scc ; encoding: [0xfd,0xa4,0x0a,0x7e]
+
+v_cvt_u16_f16 v127, 0xfe0b
+// GFX12: v_cvt_u16_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xa4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_u32_f32 v5, v1
+// GFX12: v_cvt_u32_f32_e32 v5, v1 ; encoding: [0x01,0x0f,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, v255
+// GFX12: v_cvt_u32_f32_e32 v5, v255 ; encoding: [0xff,0x0f,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, s1
+// GFX12: v_cvt_u32_f32_e32 v5, s1 ; encoding: [0x01,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, s105
+// GFX12: v_cvt_u32_f32_e32 v5, s105 ; encoding: [0x69,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, vcc_lo
+// GFX12: v_cvt_u32_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, vcc_hi
+// GFX12: v_cvt_u32_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, ttmp15
+// GFX12: v_cvt_u32_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, m0
+// GFX12: v_cvt_u32_f32_e32 v5, m0 ; encoding: [0x7d,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, exec_lo
+// GFX12: v_cvt_u32_f32_e32 v5, exec_lo ; encoding: [0x7e,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, exec_hi
+// GFX12: v_cvt_u32_f32_e32 v5, exec_hi ; encoding: [0x7f,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, null
+// GFX12: v_cvt_u32_f32_e32 v5, null ; encoding: [0x7c,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, -1
+// GFX12: v_cvt_u32_f32_e32 v5, -1 ; encoding: [0xc1,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, 0.5
+// GFX12: v_cvt_u32_f32_e32 v5, 0.5 ; encoding: [0xf0,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v5, src_scc
+// GFX12: v_cvt_u32_f32_e32 v5, src_scc ; encoding: [0xfd,0x0e,0x0a,0x7e]
+
+v_cvt_u32_f32 v255, 0xaf123456
+// GFX12: v_cvt_u32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x0e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_u32_f64 v5, v[1:2]
+// GFX12: v_cvt_u32_f64_e32 v5, v[1:2] ; encoding: [0x01,0x2b,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, v[254:255]
+// GFX12: v_cvt_u32_f64_e32 v5, v[254:255] ; encoding: [0xfe,0x2b,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, s[2:3]
+// GFX12: v_cvt_u32_f64_e32 v5, s[2:3] ; encoding: [0x02,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, s[104:105]
+// GFX12: v_cvt_u32_f64_e32 v5, s[104:105] ; encoding: [0x68,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, vcc
+// GFX12: v_cvt_u32_f64_e32 v5, vcc ; encoding: [0x6a,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, ttmp[14:15]
+// GFX12: v_cvt_u32_f64_e32 v5, ttmp[14:15] ; encoding: [0x7a,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, exec
+// GFX12: v_cvt_u32_f64_e32 v5, exec ; encoding: [0x7e,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, null
+// GFX12: v_cvt_u32_f64_e32 v5, null ; encoding: [0x7c,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, -1
+// GFX12: v_cvt_u32_f64_e32 v5, -1 ; encoding: [0xc1,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, 0.5
+// GFX12: v_cvt_u32_f64_e32 v5, 0.5 ; encoding: [0xf0,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v5, src_scc
+// GFX12: v_cvt_u32_f64_e32 v5, src_scc ; encoding: [0xfd,0x2a,0x0a,0x7e]
+
+v_cvt_u32_f64 v255, 0xaf123456
+// GFX12: v_cvt_u32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x2a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_cvt_u32_u16 v5, v1
+// GFX12: v_cvt_u32_u16_e32 v5, v1 ; encoding: [0x01,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, v127
+// GFX12: v_cvt_u32_u16_e32 v5, v127 ; encoding: [0x7f,0xd7,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, s1
+// GFX12: v_cvt_u32_u16_e32 v5, s1 ; encoding: [0x01,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, s105
+// GFX12: v_cvt_u32_u16_e32 v5, s105 ; encoding: [0x69,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, vcc_lo
+// GFX12: v_cvt_u32_u16_e32 v5, vcc_lo ; encoding: [0x6a,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, vcc_hi
+// GFX12: v_cvt_u32_u16_e32 v5, vcc_hi ; encoding: [0x6b,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, ttmp15
+// GFX12: v_cvt_u32_u16_e32 v5, ttmp15 ; encoding: [0x7b,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, m0
+// GFX12: v_cvt_u32_u16_e32 v5, m0 ; encoding: [0x7d,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, exec_lo
+// GFX12: v_cvt_u32_u16_e32 v5, exec_lo ; encoding: [0x7e,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, exec_hi
+// GFX12: v_cvt_u32_u16_e32 v5, exec_hi ; encoding: [0x7f,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, null
+// GFX12: v_cvt_u32_u16_e32 v5, null ; encoding: [0x7c,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, -1
+// GFX12: v_cvt_u32_u16_e32 v5, -1 ; encoding: [0xc1,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v5, 0.5
+// GFX12-ASM: v_cvt_u32_u16_e32 v5, 0.5 ; encoding: [0xf0,0xd6,0x0a,0x7e]
+// GFX12-DIS: v_cvt_u32_u16_e32 v5, 0x3800 ; encoding: [0xff,0xd6,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_cvt_u32_u16 v5, src_scc
+// GFX12: v_cvt_u32_u16_e32 v5, src_scc ; encoding: [0xfd,0xd6,0x0a,0x7e]
+
+v_cvt_u32_u16 v255, 0xfe0b
+// GFX12: v_cvt_u32_u16_e32 v255, 0xfe0b ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00]
+
+v_exp_f16 v5, v1
+// GFX12: v_exp_f16_e32 v5, v1 ; encoding: [0x01,0xb1,0x0a,0x7e]
+
+v_exp_f16 v5, v127
+// GFX12: v_exp_f16_e32 v5, v127 ; encoding: [0x7f,0xb1,0x0a,0x7e]
+
+v_exp_f16 v5, s1
+// GFX12: v_exp_f16_e32 v5, s1 ; encoding: [0x01,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, s105
+// GFX12: v_exp_f16_e32 v5, s105 ; encoding: [0x69,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, vcc_lo
+// GFX12: v_exp_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, vcc_hi
+// GFX12: v_exp_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, ttmp15
+// GFX12: v_exp_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, m0
+// GFX12: v_exp_f16_e32 v5, m0 ; encoding: [0x7d,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, exec_lo
+// GFX12: v_exp_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, exec_hi
+// GFX12: v_exp_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, null
+// GFX12: v_exp_f16_e32 v5, null ; encoding: [0x7c,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, -1
+// GFX12: v_exp_f16_e32 v5, -1 ; encoding: [0xc1,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, 0.5
+// GFX12: v_exp_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb0,0x0a,0x7e]
+
+v_exp_f16 v5, src_scc
+// GFX12: v_exp_f16_e32 v5, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7e]
+
+v_exp_f16 v127, 0xfe0b
+// GFX12: v_exp_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_exp_f32 v5, v1
+// GFX12: v_exp_f32_e32 v5, v1 ; encoding: [0x01,0x4b,0x0a,0x7e]
+
+v_exp_f32 v5, v255
+// GFX12: v_exp_f32_e32 v5, v255 ; encoding: [0xff,0x4b,0x0a,0x7e]
+
+v_exp_f32 v5, s1
+// GFX12: v_exp_f32_e32 v5, s1 ; encoding: [0x01,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, s105
+// GFX12: v_exp_f32_e32 v5, s105 ; encoding: [0x69,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, vcc_lo
+// GFX12: v_exp_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, vcc_hi
+// GFX12: v_exp_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, ttmp15
+// GFX12: v_exp_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, m0
+// GFX12: v_exp_f32_e32 v5, m0 ; encoding: [0x7d,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, exec_lo
+// GFX12: v_exp_f32_e32 v5, exec_lo ; encoding: [0x7e,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, exec_hi
+// GFX12: v_exp_f32_e32 v5, exec_hi ; encoding: [0x7f,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, null
+// GFX12: v_exp_f32_e32 v5, null ; encoding: [0x7c,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, -1
+// GFX12: v_exp_f32_e32 v5, -1 ; encoding: [0xc1,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, 0.5
+// GFX12: v_exp_f32_e32 v5, 0.5 ; encoding: [0xf0,0x4a,0x0a,0x7e]
+
+v_exp_f32 v5, src_scc
+// GFX12: v_exp_f32_e32 v5, src_scc ; encoding: [0xfd,0x4a,0x0a,0x7e]
+
+v_exp_f32 v255, 0xaf123456
+// GFX12: v_exp_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x4a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ffbh_i32 v5, v1
+// GFX12: v_cls_i32_e32 v5, v1 ; encoding: [0x01,0x77,0x0a,0x7e]
+
+v_ffbh_i32 v5, v255
+// GFX12: v_cls_i32_e32 v5, v255 ; encoding: [0xff,0x77,0x0a,0x7e]
+
+v_ffbh_i32 v5, s1
+// GFX12: v_cls_i32_e32 v5, s1 ; encoding: [0x01,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, s105
+// GFX12: v_cls_i32_e32 v5, s105 ; encoding: [0x69,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, vcc_lo
+// GFX12: v_cls_i32_e32 v5, vcc_lo ; encoding: [0x6a,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, vcc_hi
+// GFX12: v_cls_i32_e32 v5, vcc_hi ; encoding: [0x6b,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, ttmp15
+// GFX12: v_cls_i32_e32 v5, ttmp15 ; encoding: [0x7b,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, m0
+// GFX12: v_cls_i32_e32 v5, m0 ; encoding: [0x7d,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, exec_lo
+// GFX12: v_cls_i32_e32 v5, exec_lo ; encoding: [0x7e,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, exec_hi
+// GFX12: v_cls_i32_e32 v5, exec_hi ; encoding: [0x7f,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, null
+// GFX12: v_cls_i32_e32 v5, null ; encoding: [0x7c,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, -1
+// GFX12: v_cls_i32_e32 v5, -1 ; encoding: [0xc1,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, 0.5
+// GFX12: v_cls_i32_e32 v5, 0.5 ; encoding: [0xf0,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v5, src_scc
+// GFX12: v_cls_i32_e32 v5, src_scc ; encoding: [0xfd,0x76,0x0a,0x7e]
+
+v_ffbh_i32 v255, 0xaf123456
+// GFX12: v_cls_i32_e32 v255, 0xaf123456 ; encoding: [0xff,0x76,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ffbh_u32 v5, v1
+// GFX12: v_clz_i32_u32_e32 v5, v1 ; encoding: [0x01,0x73,0x0a,0x7e]
+
+v_ffbh_u32 v5, v255
+// GFX12: v_clz_i32_u32_e32 v5, v255 ; encoding: [0xff,0x73,0x0a,0x7e]
+
+v_ffbh_u32 v5, s1
+// GFX12: v_clz_i32_u32_e32 v5, s1 ; encoding: [0x01,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, s105
+// GFX12: v_clz_i32_u32_e32 v5, s105 ; encoding: [0x69,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, vcc_lo
+// GFX12: v_clz_i32_u32_e32 v5, vcc_lo ; encoding: [0x6a,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, vcc_hi
+// GFX12: v_clz_i32_u32_e32 v5, vcc_hi ; encoding: [0x6b,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, ttmp15
+// GFX12: v_clz_i32_u32_e32 v5, ttmp15 ; encoding: [0x7b,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, m0
+// GFX12: v_clz_i32_u32_e32 v5, m0 ; encoding: [0x7d,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, exec_lo
+// GFX12: v_clz_i32_u32_e32 v5, exec_lo ; encoding: [0x7e,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, exec_hi
+// GFX12: v_clz_i32_u32_e32 v5, exec_hi ; encoding: [0x7f,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, null
+// GFX12: v_clz_i32_u32_e32 v5, null ; encoding: [0x7c,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, -1
+// GFX12: v_clz_i32_u32_e32 v5, -1 ; encoding: [0xc1,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, 0.5
+// GFX12: v_clz_i32_u32_e32 v5, 0.5 ; encoding: [0xf0,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v5, src_scc
+// GFX12: v_clz_i32_u32_e32 v5, src_scc ; encoding: [0xfd,0x72,0x0a,0x7e]
+
+v_ffbh_u32 v255, 0xaf123456
+// GFX12: v_clz_i32_u32_e32 v255, 0xaf123456 ; encoding: [0xff,0x72,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_ffbl_b32 v5, v1
+// GFX12: v_ctz_i32_b32_e32 v5, v1 ; encoding: [0x01,0x75,0x0a,0x7e]
+
+v_ffbl_b32 v5, v255
+// GFX12: v_ctz_i32_b32_e32 v5, v255 ; encoding: [0xff,0x75,0x0a,0x7e]
+
+v_ffbl_b32 v5, s1
+// GFX12: v_ctz_i32_b32_e32 v5, s1 ; encoding: [0x01,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, s105
+// GFX12: v_ctz_i32_b32_e32 v5, s105 ; encoding: [0x69,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, vcc_lo
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, vcc_hi
+// GFX12: v_ctz_i32_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, ttmp15
+// GFX12: v_ctz_i32_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, m0
+// GFX12: v_ctz_i32_b32_e32 v5, m0 ; encoding: [0x7d,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, exec_lo
+// GFX12: v_ctz_i32_b32_e32 v5, exec_lo ; encoding: [0x7e,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, exec_hi
+// GFX12: v_ctz_i32_b32_e32 v5, exec_hi ; encoding: [0x7f,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, null
+// GFX12: v_ctz_i32_b32_e32 v5, null ; encoding: [0x7c,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, -1
+// GFX12: v_ctz_i32_b32_e32 v5, -1 ; encoding: [0xc1,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, 0.5
+// GFX12: v_ctz_i32_b32_e32 v5, 0.5 ; encoding: [0xf0,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v5, src_scc
+// GFX12: v_ctz_i32_b32_e32 v5, src_scc ; encoding: [0xfd,0x74,0x0a,0x7e]
+
+v_ffbl_b32 v255, 0xaf123456
+// GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_floor_f16 v5, v1
+// GFX12: v_floor_f16_e32 v5, v1 ; encoding: [0x01,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5, v127
+// GFX12: v_floor_f16_e32 v5, v127 ; encoding: [0x7f,0xb7,0x0a,0x7e]
+
+v_floor_f16 v5, s1
+// GFX12: v_floor_f16_e32 v5, s1 ; encoding: [0x01,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, s105
+// GFX12: v_floor_f16_e32 v5, s105 ; encoding: [0x69,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_lo
+// GFX12: v_floor_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_hi
+// GFX12: v_floor_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, ttmp15
+// GFX12: v_floor_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, m0
+// GFX12: v_floor_f16_e32 v5, m0 ; encoding: [0x7d,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, exec_lo
+// GFX12: v_floor_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, exec_hi
+// GFX12: v_floor_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, null
+// GFX12: v_floor_f16_e32 v5, null ; encoding: [0x7c,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, -1
+// GFX12: v_floor_f16_e32 v5, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, 0.5
+// GFX12: v_floor_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb6,0x0a,0x7e]
+
+v_floor_f16 v5, src_scc
+// GFX12: v_floor_f16_e32 v5, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e]
+
+v_floor_f16 v127, 0xfe0b
+// GFX12: v_floor_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb6,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_floor_f32 v5, v1
+// GFX12: v_floor_f32_e32 v5, v1 ; encoding: [0x01,0x49,0x0a,0x7e]
+
+v_floor_f32 v5, v255
+// GFX12: v_floor_f32_e32 v5, v255 ; encoding: [0xff,0x49,0x0a,0x7e]
+
+v_floor_f32 v5, s1
+// GFX12: v_floor_f32_e32 v5, s1 ; encoding: [0x01,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, s105
+// GFX12: v_floor_f32_e32 v5, s105 ; encoding: [0x69,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, vcc_lo
+// GFX12: v_floor_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, vcc_hi
+// GFX12: v_floor_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, ttmp15
+// GFX12: v_floor_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, m0
+// GFX12: v_floor_f32_e32 v5, m0 ; encoding: [0x7d,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, exec_lo
+// GFX12: v_floor_f32_e32 v5, exec_lo ; encoding: [0x7e,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, exec_hi
+// GFX12: v_floor_f32_e32 v5, exec_hi ; encoding: [0x7f,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, null
+// GFX12: v_floor_f32_e32 v5, null ; encoding: [0x7c,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, -1
+// GFX12: v_floor_f32_e32 v5, -1 ; encoding: [0xc1,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, 0.5
+// GFX12: v_floor_f32_e32 v5, 0.5 ; encoding: [0xf0,0x48,0x0a,0x7e]
+
+v_floor_f32 v5, src_scc
+// GFX12: v_floor_f32_e32 v5, src_scc ; encoding: [0xfd,0x48,0x0a,0x7e]
+
+v_floor_f32 v255, 0xaf123456
+// GFX12: v_floor_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x48,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_floor_f64 v[5:6], v[1:2]
+// GFX12: v_floor_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x35,0x0a,0x7e]
+
+v_floor_f64 v[5:6], v[254:255]
+// GFX12: v_floor_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x35,0x0a,0x7e]
+
+v_floor_f64 v[5:6], s[2:3]
+// GFX12: v_floor_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], s[104:105]
+// GFX12: v_floor_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], vcc
+// GFX12: v_floor_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], ttmp[14:15]
+// GFX12: v_floor_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], exec
+// GFX12: v_floor_f64_e32 v[5:6], exec ; encoding: [0x7e,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], null
+// GFX12: v_floor_f64_e32 v[5:6], null ; encoding: [0x7c,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], -1
+// GFX12: v_floor_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], 0.5
+// GFX12: v_floor_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x34,0x0a,0x7e]
+
+v_floor_f64 v[5:6], src_scc
+// GFX12: v_floor_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x34,0x0a,0x7e]
+
+v_floor_f64 v[254:255], 0xaf123456
+// GFX12: v_floor_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x34,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_fract_f16 v5, v1
+// GFX12: v_fract_f16_e32 v5, v1 ; encoding: [0x01,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5, v127
+// GFX12: v_fract_f16_e32 v5, v127 ; encoding: [0x7f,0xbf,0x0a,0x7e]
+
+v_fract_f16 v5, s1
+// GFX12: v_fract_f16_e32 v5, s1 ; encoding: [0x01,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, s105
+// GFX12: v_fract_f16_e32 v5, s105 ; encoding: [0x69,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, vcc_lo
+// GFX12: v_fract_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, vcc_hi
+// GFX12: v_fract_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, ttmp15
+// GFX12: v_fract_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, m0
+// GFX12: v_fract_f16_e32 v5, m0 ; encoding: [0x7d,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, exec_lo
+// GFX12: v_fract_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, exec_hi
+// GFX12: v_fract_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, null
+// GFX12: v_fract_f16_e32 v5, null ; encoding: [0x7c,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, -1
+// GFX12: v_fract_f16_e32 v5, -1 ; encoding: [0xc1,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, 0.5
+// GFX12: v_fract_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbe,0x0a,0x7e]
+
+v_fract_f16 v5, src_scc
+// GFX12: v_fract_f16_e32 v5, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e]
+
+v_fract_f16 v127, 0xfe0b
+// GFX12: v_fract_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_fract_f32 v5, v1
+// GFX12: v_fract_f32_e32 v5, v1 ; encoding: [0x01,0x41,0x0a,0x7e]
+
+v_fract_f32 v5, v255
+// GFX12: v_fract_f32_e32 v5, v255 ; encoding: [0xff,0x41,0x0a,0x7e]
+
+v_fract_f32 v5, s1
+// GFX12: v_fract_f32_e32 v5, s1 ; encoding: [0x01,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, s105
+// GFX12: v_fract_f32_e32 v5, s105 ; encoding: [0x69,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, vcc_lo
+// GFX12: v_fract_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, vcc_hi
+// GFX12: v_fract_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, ttmp15
+// GFX12: v_fract_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, m0
+// GFX12: v_fract_f32_e32 v5, m0 ; encoding: [0x7d,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, exec_lo
+// GFX12: v_fract_f32_e32 v5, exec_lo ; encoding: [0x7e,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, exec_hi
+// GFX12: v_fract_f32_e32 v5, exec_hi ; encoding: [0x7f,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, null
+// GFX12: v_fract_f32_e32 v5, null ; encoding: [0x7c,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, -1
+// GFX12: v_fract_f32_e32 v5, -1 ; encoding: [0xc1,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, 0.5
+// GFX12: v_fract_f32_e32 v5, 0.5 ; encoding: [0xf0,0x40,0x0a,0x7e]
+
+v_fract_f32 v5, src_scc
+// GFX12: v_fract_f32_e32 v5, src_scc ; encoding: [0xfd,0x40,0x0a,0x7e]
+
+v_fract_f32 v255, 0xaf123456
+// GFX12: v_fract_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x40,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_fract_f64 v[5:6], v[1:2]
+// GFX12: v_fract_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x7d,0x0a,0x7e]
+
+v_fract_f64 v[5:6], v[254:255]
+// GFX12: v_fract_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x7d,0x0a,0x7e]
+
+v_fract_f64 v[5:6], s[2:3]
+// GFX12: v_fract_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], s[104:105]
+// GFX12: v_fract_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], vcc
+// GFX12: v_fract_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], ttmp[14:15]
+// GFX12: v_fract_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], exec
+// GFX12: v_fract_f64_e32 v[5:6], exec ; encoding: [0x7e,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], null
+// GFX12: v_fract_f64_e32 v[5:6], null ; encoding: [0x7c,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], -1
+// GFX12: v_fract_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], 0.5
+// GFX12: v_fract_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[5:6], src_scc
+// GFX12: v_fract_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x7c,0x0a,0x7e]
+
+v_fract_f64 v[254:255], 0xaf123456
+// GFX12: v_fract_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x7c,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_exp_i16_f16 v5, v1
+// GFX12: v_frexp_exp_i16_f16_e32 v5, v1 ; encoding: [0x01,0xb5,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, v127
+// GFX12: v_frexp_exp_i16_f16_e32 v5, v127 ; encoding: [0x7f,0xb5,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, s1
+// GFX12: v_frexp_exp_i16_f16_e32 v5, s1 ; encoding: [0x01,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, s105
+// GFX12: v_frexp_exp_i16_f16_e32 v5, s105 ; encoding: [0x69,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, vcc_lo
+// GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, vcc_hi
+// GFX12: v_frexp_exp_i16_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, ttmp15
+// GFX12: v_frexp_exp_i16_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, m0
+// GFX12: v_frexp_exp_i16_f16_e32 v5, m0 ; encoding: [0x7d,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, exec_lo
+// GFX12: v_frexp_exp_i16_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, exec_hi
+// GFX12: v_frexp_exp_i16_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, null
+// GFX12: v_frexp_exp_i16_f16_e32 v5, null ; encoding: [0x7c,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, -1
+// GFX12: v_frexp_exp_i16_f16_e32 v5, -1 ; encoding: [0xc1,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, 0.5
+// GFX12: v_frexp_exp_i16_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, src_scc
+// GFX12: v_frexp_exp_i16_f16_e32 v5, src_scc ; encoding: [0xfd,0xb4,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v127, 0xfe0b
+// GFX12: v_frexp_exp_i16_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_frexp_exp_i32_f32 v5, v1
+// GFX12: v_frexp_exp_i32_f32_e32 v5, v1 ; encoding: [0x01,0x7f,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, v255
+// GFX12: v_frexp_exp_i32_f32_e32 v5, v255 ; encoding: [0xff,0x7f,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, s1
+// GFX12: v_frexp_exp_i32_f32_e32 v5, s1 ; encoding: [0x01,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, s105
+// GFX12: v_frexp_exp_i32_f32_e32 v5, s105 ; encoding: [0x69,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, vcc_lo
+// GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, vcc_hi
+// GFX12: v_frexp_exp_i32_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, ttmp15
+// GFX12: v_frexp_exp_i32_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, m0
+// GFX12: v_frexp_exp_i32_f32_e32 v5, m0 ; encoding: [0x7d,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, exec_lo
+// GFX12: v_frexp_exp_i32_f32_e32 v5, exec_lo ; encoding: [0x7e,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, exec_hi
+// GFX12: v_frexp_exp_i32_f32_e32 v5, exec_hi ; encoding: [0x7f,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, null
+// GFX12: v_frexp_exp_i32_f32_e32 v5, null ; encoding: [0x7c,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, -1
+// GFX12: v_frexp_exp_i32_f32_e32 v5, -1 ; encoding: [0xc1,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, 0.5
+// GFX12: v_frexp_exp_i32_f32_e32 v5, 0.5 ; encoding: [0xf0,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v5, src_scc
+// GFX12: v_frexp_exp_i32_f32_e32 v5, src_scc ; encoding: [0xfd,0x7e,0x0a,0x7e]
+
+v_frexp_exp_i32_f32 v255, 0xaf123456
+// GFX12: v_frexp_exp_i32_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x7e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_exp_i32_f64 v5, v[1:2]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, v[1:2] ; encoding: [0x01,0x79,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, v[254:255]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, v[254:255] ; encoding: [0xfe,0x79,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, s[2:3]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, s[2:3] ; encoding: [0x02,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, s[104:105]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, s[104:105] ; encoding: [0x68,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, vcc
+// GFX12: v_frexp_exp_i32_f64_e32 v5, vcc ; encoding: [0x6a,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, ttmp[14:15]
+// GFX12: v_frexp_exp_i32_f64_e32 v5, ttmp[14:15] ; encoding: [0x7a,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, exec
+// GFX12: v_frexp_exp_i32_f64_e32 v5, exec ; encoding: [0x7e,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, null
+// GFX12: v_frexp_exp_i32_f64_e32 v5, null ; encoding: [0x7c,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, -1
+// GFX12: v_frexp_exp_i32_f64_e32 v5, -1 ; encoding: [0xc1,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, 0.5
+// GFX12: v_frexp_exp_i32_f64_e32 v5, 0.5 ; encoding: [0xf0,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v5, src_scc
+// GFX12: v_frexp_exp_i32_f64_e32 v5, src_scc ; encoding: [0xfd,0x78,0x0a,0x7e]
+
+v_frexp_exp_i32_f64 v255, 0xaf123456
+// GFX12: v_frexp_exp_i32_f64_e32 v255, 0xaf123456 ; encoding: [0xff,0x78,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_mant_f16 v5, v1
+// GFX12: v_frexp_mant_f16_e32 v5, v1 ; encoding: [0x01,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, v127
+// GFX12: v_frexp_mant_f16_e32 v5, v127 ; encoding: [0x7f,0xb3,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, s1
+// GFX12: v_frexp_mant_f16_e32 v5, s1 ; encoding: [0x01,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, s105
+// GFX12: v_frexp_mant_f16_e32 v5, s105 ; encoding: [0x69,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, vcc_lo
+// GFX12: v_frexp_mant_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, vcc_hi
+// GFX12: v_frexp_mant_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, ttmp15
+// GFX12: v_frexp_mant_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, m0
+// GFX12: v_frexp_mant_f16_e32 v5, m0 ; encoding: [0x7d,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, exec_lo
+// GFX12: v_frexp_mant_f16_e32 v5, exec_lo ; encoding: [0x7e,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, exec_hi
+// GFX12: v_frexp_mant_f16_e32 v5, exec_hi ; encoding: [0x7f,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, null
+// GFX12: v_frexp_mant_f16_e32 v5, null ; encoding: [0x7c,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, -1
+// GFX12: v_frexp_mant_f16_e32 v5, -1 ; encoding: [0xc1,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, 0.5
+// GFX12: v_frexp_mant_f16_e32 v5, 0.5 ; encoding: [0xf0,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, src_scc
+// GFX12: v_frexp_mant_f16_e32 v5, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e]
+
+v_frexp_mant_f16 v127, 0xfe0b
+// GFX12: v_frexp_mant_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f32 v5, v1
+// GFX12: v_frexp_mant_f32_e32 v5, v1 ; encoding: [0x01,0x81,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, v255
+// GFX12: v_frexp_mant_f32_e32 v5, v255 ; encoding: [0xff,0x81,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, s1
+// GFX12: v_frexp_mant_f32_e32 v5, s1 ; encoding: [0x01,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, s105
+// GFX12: v_frexp_mant_f32_e32 v5, s105 ; encoding: [0x69,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, vcc_lo
+// GFX12: v_frexp_mant_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, vcc_hi
+// GFX12: v_frexp_mant_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, ttmp15
+// GFX12: v_frexp_mant_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, m0
+// GFX12: v_frexp_mant_f32_e32 v5, m0 ; encoding: [0x7d,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, exec_lo
+// GFX12: v_frexp_mant_f32_e32 v5, exec_lo ; encoding: [0x7e,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, exec_hi
+// GFX12: v_frexp_mant_f32_e32 v5, exec_hi ; encoding: [0x7f,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, null
+// GFX12: v_frexp_mant_f32_e32 v5, null ; encoding: [0x7c,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, -1
+// GFX12: v_frexp_mant_f32_e32 v5, -1 ; encoding: [0xc1,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, 0.5
+// GFX12: v_frexp_mant_f32_e32 v5, 0.5 ; encoding: [0xf0,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v5, src_scc
+// GFX12: v_frexp_mant_f32_e32 v5, src_scc ; encoding: [0xfd,0x80,0x0a,0x7e]
+
+v_frexp_mant_f32 v255, 0xaf123456
+// GFX12: v_frexp_mant_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x80,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_frexp_mant_f64 v[5:6], v[1:2]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x7b,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], v[254:255]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x7b,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], s[2:3]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], s[104:105]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], vcc
+// GFX12: v_frexp_mant_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], ttmp[14:15]
+// GFX12: v_frexp_mant_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], exec
+// GFX12: v_frexp_mant_f64_e32 v[5:6], exec ; encoding: [0x7e,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], null
+// GFX12: v_frexp_mant_f64_e32 v[5:6], null ; encoding: [0x7c,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], -1
+// GFX12: v_frexp_mant_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], 0.5
+// GFX12: v_frexp_mant_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[5:6], src_scc
+// GFX12: v_frexp_mant_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x7a,0x0a,0x7e]
+
+v_frexp_mant_f64 v[254:255], 0xaf123456
+// GFX12: v_frexp_mant_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x7a,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_log_f16 v5, v1
+// GFX12: v_log_f16_e32 v5, v1 ; encoding: [0x01,0xaf,0x0a,0x7e]
+
+v_log_f16 v5, v127
+// GFX12: v_log_f16_e32 v5, v127 ; encoding: [0x7f,0xaf,0x0a,0x7e]
+
+v_log_f16 v5, s1
+// GFX12: v_log_f16_e32 v5, s1 ; encoding: [0x01,0xae,0x0a,0x7e]
+
+v_log_f16 v5, s105
+// GFX12: v_log_f16_e32 v5, s105 ; encoding: [0x69,0xae,0x0a,0x7e]
+
+v_log_f16 v5, vcc_lo
+// GFX12: v_log_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xae,0x0a,0x7e]
+
+v_log_f16 v5, vcc_hi
+// GFX12: v_log_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xae,0x0a,0x7e]
+
+v_log_f16 v5, ttmp15
+// GFX12: v_log_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xae,0x0a,0x7e]
+
+v_log_f16 v5, m0
+// GFX12: v_log_f16_e32 v5, m0 ; encoding: [0x7d,0xae,0x0a,0x7e]
+
+v_log_f16 v5, exec_lo
+// GFX12: v_log_f16_e32 v5, exec_lo ; encoding: [0x7e,0xae,0x0a,0x7e]
+
+v_log_f16 v5, exec_hi
+// GFX12: v_log_f16_e32 v5, exec_hi ; encoding: [0x7f,0xae,0x0a,0x7e]
+
+v_log_f16 v5, null
+// GFX12: v_log_f16_e32 v5, null ; encoding: [0x7c,0xae,0x0a,0x7e]
+
+v_log_f16 v5, -1
+// GFX12: v_log_f16_e32 v5, -1 ; encoding: [0xc1,0xae,0x0a,0x7e]
+
+v_log_f16 v5, 0.5
+// GFX12: v_log_f16_e32 v5, 0.5 ; encoding: [0xf0,0xae,0x0a,0x7e]
+
+v_log_f16 v5, src_scc
+// GFX12: v_log_f16_e32 v5, src_scc ; encoding: [0xfd,0xae,0x0a,0x7e]
+
+v_log_f16 v127, 0xfe0b
+// GFX12: v_log_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xae,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_log_f32 v5, v1
+// GFX12: v_log_f32_e32 v5, v1 ; encoding: [0x01,0x4f,0x0a,0x7e]
+
+v_log_f32 v5, v255
+// GFX12: v_log_f32_e32 v5, v255 ; encoding: [0xff,0x4f,0x0a,0x7e]
+
+v_log_f32 v5, s1
+// GFX12: v_log_f32_e32 v5, s1 ; encoding: [0x01,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, s105
+// GFX12: v_log_f32_e32 v5, s105 ; encoding: [0x69,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, vcc_lo
+// GFX12: v_log_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, vcc_hi
+// GFX12: v_log_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, ttmp15
+// GFX12: v_log_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, m0
+// GFX12: v_log_f32_e32 v5, m0 ; encoding: [0x7d,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, exec_lo
+// GFX12: v_log_f32_e32 v5, exec_lo ; encoding: [0x7e,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, exec_hi
+// GFX12: v_log_f32_e32 v5, exec_hi ; encoding: [0x7f,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, null
+// GFX12: v_log_f32_e32 v5, null ; encoding: [0x7c,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, -1
+// GFX12: v_log_f32_e32 v5, -1 ; encoding: [0xc1,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, 0.5
+// GFX12: v_log_f32_e32 v5, 0.5 ; encoding: [0xf0,0x4e,0x0a,0x7e]
+
+v_log_f32 v5, src_scc
+// GFX12: v_log_f32_e32 v5, src_scc ; encoding: [0xfd,0x4e,0x0a,0x7e]
+
+v_log_f32 v255, 0xaf123456
+// GFX12: v_log_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_mov_b32 v5, v1
+// GFX12: v_mov_b32_e32 v5, v1 ; encoding: [0x01,0x03,0x0a,0x7e]
+
+v_mov_b32 v5, v255
+// GFX12: v_mov_b32_e32 v5, v255 ; encoding: [0xff,0x03,0x0a,0x7e]
+
+v_mov_b32 v5, s1
+// GFX12: v_mov_b32_e32 v5, s1 ; encoding: [0x01,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, s105
+// GFX12: v_mov_b32_e32 v5, s105 ; encoding: [0x69,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, vcc_lo
+// GFX12: v_mov_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, vcc_hi
+// GFX12: v_mov_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, ttmp15
+// GFX12: v_mov_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, m0
+// GFX12: v_mov_b32_e32 v5, m0 ; encoding: [0x7d,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, exec_lo
+// GFX12: v_mov_b32_e32 v5, exec_lo ; encoding: [0x7e,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, exec_hi
+// GFX12: v_mov_b32_e32 v5, exec_hi ; encoding: [0x7f,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, null
+// GFX12: v_mov_b32_e32 v5, null ; encoding: [0x7c,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, -1
+// GFX12: v_mov_b32_e32 v5, -1 ; encoding: [0xc1,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, 0.5
+// GFX12: v_mov_b32_e32 v5, 0.5 ; encoding: [0xf0,0x02,0x0a,0x7e]
+
+v_mov_b32 v5, src_scc
+// GFX12: v_mov_b32_e32 v5, src_scc ; encoding: [0xfd,0x02,0x0a,0x7e]
+
+v_mov_b32 v255, 0xaf123456
+// GFX12: v_mov_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x02,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_movreld_b32 v5, v1
+// GFX12: v_movreld_b32_e32 v5, v1 ; encoding: [0x01,0x85,0x0a,0x7e]
+
+v_movreld_b32 v5, v255
+// GFX12: v_movreld_b32_e32 v5, v255 ; encoding: [0xff,0x85,0x0a,0x7e]
+
+v_movreld_b32 v5, s1
+// GFX12: v_movreld_b32_e32 v5, s1 ; encoding: [0x01,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, s105
+// GFX12: v_movreld_b32_e32 v5, s105 ; encoding: [0x69,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, vcc_lo
+// GFX12: v_movreld_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, vcc_hi
+// GFX12: v_movreld_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, ttmp15
+// GFX12: v_movreld_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, m0
+// GFX12: v_movreld_b32_e32 v5, m0 ; encoding: [0x7d,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, exec_lo
+// GFX12: v_movreld_b32_e32 v5, exec_lo ; encoding: [0x7e,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, exec_hi
+// GFX12: v_movreld_b32_e32 v5, exec_hi ; encoding: [0x7f,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, null
+// GFX12: v_movreld_b32_e32 v5, null ; encoding: [0x7c,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, -1
+// GFX12: v_movreld_b32_e32 v5, -1 ; encoding: [0xc1,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, 0.5
+// GFX12: v_movreld_b32_e32 v5, 0.5 ; encoding: [0xf0,0x84,0x0a,0x7e]
+
+v_movreld_b32 v5, src_scc
+// GFX12: v_movreld_b32_e32 v5, src_scc ; encoding: [0xfd,0x84,0x0a,0x7e]
+
+v_movreld_b32 v255, 0xaf123456
+// GFX12: v_movreld_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x84,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_movrels_b32 v5, v1
+// GFX12: v_movrels_b32_e32 v5, v1 ; encoding: [0x01,0x87,0x0a,0x7e]
+
+v_movrels_b32 v255, v255
+// GFX12: v_movrels_b32_e32 v255, v255 ; encoding: [0xff,0x87,0xfe,0x7f]
+
+v_movrelsd_2_b32 v5, v1
+// GFX12: v_movrelsd_2_b32_e32 v5, v1 ; encoding: [0x01,0x91,0x0a,0x7e]
+
+v_movrelsd_2_b32 v255, v255
+// GFX12: v_movrelsd_2_b32_e32 v255, v255 ; encoding: [0xff,0x91,0xfe,0x7f]
+
+v_movrelsd_b32 v5, v1
+// GFX12: v_movrelsd_b32_e32 v5, v1 ; encoding: [0x01,0x89,0x0a,0x7e]
+
+v_movrelsd_b32 v255, v255
+// GFX12: v_movrelsd_b32_e32 v255, v255 ; encoding: [0xff,0x89,0xfe,0x7f]
+
+v_nop
+// GFX12: v_nop ; encoding: [0x00,0x00,0x00,0x7e]
+
+v_not_b16 v5, v1
+// GFX12: v_not_b16_e32 v5, v1 ; encoding: [0x01,0xd3,0x0a,0x7e]
+
+v_not_b16 v5, v127
+// GFX12: v_not_b16_e32 v5, v127 ; encoding: [0x7f,0xd3,0x0a,0x7e]
+
+v_not_b16 v5, s1
+// GFX12: v_not_b16_e32 v5, s1 ; encoding: [0x01,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, s105
+// GFX12: v_not_b16_e32 v5, s105 ; encoding: [0x69,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, vcc_lo
+// GFX12: v_not_b16_e32 v5, vcc_lo ; encoding: [0x6a,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, vcc_hi
+// GFX12: v_not_b16_e32 v5, vcc_hi ; encoding: [0x6b,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, ttmp15
+// GFX12: v_not_b16_e32 v5, ttmp15 ; encoding: [0x7b,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, m0
+// GFX12: v_not_b16_e32 v5, m0 ; encoding: [0x7d,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, exec_lo
+// GFX12: v_not_b16_e32 v5, exec_lo ; encoding: [0x7e,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, exec_hi
+// GFX12: v_not_b16_e32 v5, exec_hi ; encoding: [0x7f,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, null
+// GFX12: v_not_b16_e32 v5, null ; encoding: [0x7c,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, -1
+// GFX12: v_not_b16_e32 v5, -1 ; encoding: [0xc1,0xd2,0x0a,0x7e]
+
+v_not_b16 v5, 0.5
+// GFX12-ASM: v_not_b16_e32 v5, 0.5 ; encoding: [0xf0,0xd2,0x0a,0x7e]
+// GFX12-DIS: v_not_b16_e32 v5, 0x3800 ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x38,0x00,0x00]
+
+v_not_b16 v5, src_scc
+// GFX12: v_not_b16_e32 v5, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e]
+
+v_not_b16 v127, 0xfe0b
+// GFX12: v_not_b16_e32 v127, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_not_b32 v5, v1
+// GFX12: v_not_b32_e32 v5, v1 ; encoding: [0x01,0x6f,0x0a,0x7e]
+
+v_not_b32 v5, v255
+// GFX12: v_not_b32_e32 v5, v255 ; encoding: [0xff,0x6f,0x0a,0x7e]
+
+v_not_b32 v5, s1
+// GFX12: v_not_b32_e32 v5, s1 ; encoding: [0x01,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, s105
+// GFX12: v_not_b32_e32 v5, s105 ; encoding: [0x69,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, vcc_lo
+// GFX12: v_not_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, vcc_hi
+// GFX12: v_not_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, ttmp15
+// GFX12: v_not_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, m0
+// GFX12: v_not_b32_e32 v5, m0 ; encoding: [0x7d,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, exec_lo
+// GFX12: v_not_b32_e32 v5, exec_lo ; encoding: [0x7e,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, exec_hi
+// GFX12: v_not_b32_e32 v5, exec_hi ; encoding: [0x7f,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, null
+// GFX12: v_not_b32_e32 v5, null ; encoding: [0x7c,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, -1
+// GFX12: v_not_b32_e32 v5, -1 ; encoding: [0xc1,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, 0.5
+// GFX12: v_not_b32_e32 v5, 0.5 ; encoding: [0xf0,0x6e,0x0a,0x7e]
+
+v_not_b32 v5, src_scc
+// GFX12: v_not_b32_e32 v5, src_scc ; encoding: [0xfd,0x6e,0x0a,0x7e]
+
+v_not_b32 v255, 0xaf123456
+// GFX12: v_not_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x6e,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_permlane64_b32 v5, v1
+// GFX12: v_permlane64_b32 v5, v1 ; encoding: [0x01,0xcf,0x0a,0x7e]
+
+v_permlane64_b32 v255, v255
+// GFX12: v_permlane64_b32 v255, v255 ; encoding: [0xff,0xcf,0xfe,0x7f]
+
+v_pipeflush
+// GFX12: v_pipeflush ; encoding: [0x00,0x36,0x00,0x7e]
+
+v_rcp_f16 v5, v1
+// GFX12: v_rcp_f16_e32 v5, v1 ; encoding: [0x01,0xa9,0x0a,0x7e]
+
+v_rcp_f16 v5, v127
+// GFX12: v_rcp_f16_e32 v5, v127 ; encoding: [0x7f,0xa9,0x0a,0x7e]
+
+v_rcp_f16 v5, s1
+// GFX12: v_rcp_f16_e32 v5, s1 ; encoding: [0x01,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, s105
+// GFX12: v_rcp_f16_e32 v5, s105 ; encoding: [0x69,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, vcc_lo
+// GFX12: v_rcp_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, vcc_hi
+// GFX12: v_rcp_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, ttmp15
+// GFX12: v_rcp_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, m0
+// GFX12: v_rcp_f16_e32 v5, m0 ; encoding: [0x7d,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, exec_lo
+// GFX12: v_rcp_f16_e32 v5, exec_lo ; encoding: [0x7e,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, exec_hi
+// GFX12: v_rcp_f16_e32 v5, exec_hi ; encoding: [0x7f,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, null
+// GFX12: v_rcp_f16_e32 v5, null ; encoding: [0x7c,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, -1
+// GFX12: v_rcp_f16_e32 v5, -1 ; encoding: [0xc1,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, 0.5
+// GFX12: v_rcp_f16_e32 v5, 0.5 ; encoding: [0xf0,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v5, src_scc
+// GFX12: v_rcp_f16_e32 v5, src_scc ; encoding: [0xfd,0xa8,0x0a,0x7e]
+
+v_rcp_f16 v127, 0xfe0b
+// GFX12: v_rcp_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xa8,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rcp_f32 v5, v1
+// GFX12: v_rcp_f32_e32 v5, v1 ; encoding: [0x01,0x55,0x0a,0x7e]
+
+v_rcp_f32 v5, v255
+// GFX12: v_rcp_f32_e32 v5, v255 ; encoding: [0xff,0x55,0x0a,0x7e]
+
+v_rcp_f32 v5, s1
+// GFX12: v_rcp_f32_e32 v5, s1 ; encoding: [0x01,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, s105
+// GFX12: v_rcp_f32_e32 v5, s105 ; encoding: [0x69,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, vcc_lo
+// GFX12: v_rcp_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, vcc_hi
+// GFX12: v_rcp_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, ttmp15
+// GFX12: v_rcp_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, m0
+// GFX12: v_rcp_f32_e32 v5, m0 ; encoding: [0x7d,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, exec_lo
+// GFX12: v_rcp_f32_e32 v5, exec_lo ; encoding: [0x7e,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, exec_hi
+// GFX12: v_rcp_f32_e32 v5, exec_hi ; encoding: [0x7f,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, null
+// GFX12: v_rcp_f32_e32 v5, null ; encoding: [0x7c,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, -1
+// GFX12: v_rcp_f32_e32 v5, -1 ; encoding: [0xc1,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, 0.5
+// GFX12: v_rcp_f32_e32 v5, 0.5 ; encoding: [0xf0,0x54,0x0a,0x7e]
+
+v_rcp_f32 v5, src_scc
+// GFX12: v_rcp_f32_e32 v5, src_scc ; encoding: [0xfd,0x54,0x0a,0x7e]
+
+v_rcp_f32 v255, 0xaf123456
+// GFX12: v_rcp_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x54,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rcp_f64 v[5:6], v[1:2]
+// GFX12: v_rcp_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x5f,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], v[254:255]
+// GFX12: v_rcp_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x5f,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], s[2:3]
+// GFX12: v_rcp_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], s[104:105]
+// GFX12: v_rcp_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], vcc
+// GFX12: v_rcp_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], ttmp[14:15]
+// GFX12: v_rcp_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], exec
+// GFX12: v_rcp_f64_e32 v[5:6], exec ; encoding: [0x7e,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], null
+// GFX12: v_rcp_f64_e32 v[5:6], null ; encoding: [0x7c,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], -1
+// GFX12: v_rcp_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], 0.5
+// GFX12: v_rcp_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[5:6], src_scc
+// GFX12: v_rcp_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x5e,0x0a,0x7e]
+
+v_rcp_f64 v[254:255], 0xaf123456
+// GFX12: v_rcp_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x5e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rcp_iflag_f32 v5, v1
+// GFX12: v_rcp_iflag_f32_e32 v5, v1 ; encoding: [0x01,0x57,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, v255
+// GFX12: v_rcp_iflag_f32_e32 v5, v255 ; encoding: [0xff,0x57,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, s1
+// GFX12: v_rcp_iflag_f32_e32 v5, s1 ; encoding: [0x01,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, s105
+// GFX12: v_rcp_iflag_f32_e32 v5, s105 ; encoding: [0x69,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, vcc_lo
+// GFX12: v_rcp_iflag_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, vcc_hi
+// GFX12: v_rcp_iflag_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, ttmp15
+// GFX12: v_rcp_iflag_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, m0
+// GFX12: v_rcp_iflag_f32_e32 v5, m0 ; encoding: [0x7d,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, exec_lo
+// GFX12: v_rcp_iflag_f32_e32 v5, exec_lo ; encoding: [0x7e,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, exec_hi
+// GFX12: v_rcp_iflag_f32_e32 v5, exec_hi ; encoding: [0x7f,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, null
+// GFX12: v_rcp_iflag_f32_e32 v5, null ; encoding: [0x7c,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, -1
+// GFX12: v_rcp_iflag_f32_e32 v5, -1 ; encoding: [0xc1,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, 0.5
+// GFX12: v_rcp_iflag_f32_e32 v5, 0.5 ; encoding: [0xf0,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v5, src_scc
+// GFX12: v_rcp_iflag_f32_e32 v5, src_scc ; encoding: [0xfd,0x56,0x0a,0x7e]
+
+v_rcp_iflag_f32 v255, 0xaf123456
+// GFX12: v_rcp_iflag_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x56,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_readfirstlane_b32 s5, v1
+// GFX12: v_readfirstlane_b32 s5, v1 ; encoding: [0x01,0x05,0x0a,0x7e]
+
+v_readfirstlane_b32 s105, v1
+// GFX12: v_readfirstlane_b32 s105, v1 ; encoding: [0x01,0x05,0xd2,0x7e]
+
+v_readfirstlane_b32 vcc_lo, v1
+// GFX12: v_readfirstlane_b32 vcc_lo, v1 ; encoding: [0x01,0x05,0xd4,0x7e]
+
+v_readfirstlane_b32 vcc_hi, v1
+// GFX12: v_readfirstlane_b32 vcc_hi, v1 ; encoding: [0x01,0x05,0xd6,0x7e]
+
+v_readfirstlane_b32 ttmp15, v1
+// GFX12: v_readfirstlane_b32 ttmp15, v1 ; encoding: [0x01,0x05,0xf6,0x7e]
+
+v_readfirstlane_b32 null, v255
+// GFX12: v_readfirstlane_b32 null, v255 ; encoding: [0xff,0x05,0xf8,0x7e]
+
+v_rndne_f16 v5, v1
+// GFX12: v_rndne_f16_e32 v5, v1 ; encoding: [0x01,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5, v127
+// GFX12: v_rndne_f16_e32 v5, v127 ; encoding: [0x7f,0xbd,0x0a,0x7e]
+
+v_rndne_f16 v5, s1
+// GFX12: v_rndne_f16_e32 v5, s1 ; encoding: [0x01,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, s105
+// GFX12: v_rndne_f16_e32 v5, s105 ; encoding: [0x69,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, vcc_lo
+// GFX12: v_rndne_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, vcc_hi
+// GFX12: v_rndne_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, ttmp15
+// GFX12: v_rndne_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, m0
+// GFX12: v_rndne_f16_e32 v5, m0 ; encoding: [0x7d,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, exec_lo
+// GFX12: v_rndne_f16_e32 v5, exec_lo ; encoding: [0x7e,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, exec_hi
+// GFX12: v_rndne_f16_e32 v5, exec_hi ; encoding: [0x7f,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, null
+// GFX12: v_rndne_f16_e32 v5, null ; encoding: [0x7c,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, -1
+// GFX12: v_rndne_f16_e32 v5, -1 ; encoding: [0xc1,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, 0.5
+// GFX12: v_rndne_f16_e32 v5, 0.5 ; encoding: [0xf0,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v5, src_scc
+// GFX12: v_rndne_f16_e32 v5, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e]
+
+v_rndne_f16 v127, 0xfe0b
+// GFX12: v_rndne_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f32 v5, v1
+// GFX12: v_rndne_f32_e32 v5, v1 ; encoding: [0x01,0x47,0x0a,0x7e]
+
+v_rndne_f32 v5, v255
+// GFX12: v_rndne_f32_e32 v5, v255 ; encoding: [0xff,0x47,0x0a,0x7e]
+
+v_rndne_f32 v5, s1
+// GFX12: v_rndne_f32_e32 v5, s1 ; encoding: [0x01,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, s105
+// GFX12: v_rndne_f32_e32 v5, s105 ; encoding: [0x69,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, vcc_lo
+// GFX12: v_rndne_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, vcc_hi
+// GFX12: v_rndne_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, ttmp15
+// GFX12: v_rndne_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, m0
+// GFX12: v_rndne_f32_e32 v5, m0 ; encoding: [0x7d,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, exec_lo
+// GFX12: v_rndne_f32_e32 v5, exec_lo ; encoding: [0x7e,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, exec_hi
+// GFX12: v_rndne_f32_e32 v5, exec_hi ; encoding: [0x7f,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, null
+// GFX12: v_rndne_f32_e32 v5, null ; encoding: [0x7c,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, -1
+// GFX12: v_rndne_f32_e32 v5, -1 ; encoding: [0xc1,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, 0.5
+// GFX12: v_rndne_f32_e32 v5, 0.5 ; encoding: [0xf0,0x46,0x0a,0x7e]
+
+v_rndne_f32 v5, src_scc
+// GFX12: v_rndne_f32_e32 v5, src_scc ; encoding: [0xfd,0x46,0x0a,0x7e]
+
+v_rndne_f32 v255, 0xaf123456
+// GFX12: v_rndne_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x46,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rndne_f64 v[5:6], v[1:2]
+// GFX12: v_rndne_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x33,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], v[254:255]
+// GFX12: v_rndne_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x33,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], s[2:3]
+// GFX12: v_rndne_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], s[104:105]
+// GFX12: v_rndne_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], vcc
+// GFX12: v_rndne_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], ttmp[14:15]
+// GFX12: v_rndne_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], exec
+// GFX12: v_rndne_f64_e32 v[5:6], exec ; encoding: [0x7e,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], null
+// GFX12: v_rndne_f64_e32 v[5:6], null ; encoding: [0x7c,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], -1
+// GFX12: v_rndne_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], 0.5
+// GFX12: v_rndne_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[5:6], src_scc
+// GFX12: v_rndne_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x32,0x0a,0x7e]
+
+v_rndne_f64 v[254:255], 0xaf123456
+// GFX12: v_rndne_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x32,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rsq_f16 v5, v1
+// GFX12: v_rsq_f16_e32 v5, v1 ; encoding: [0x01,0xad,0x0a,0x7e]
+
+v_rsq_f16 v5, v127
+// GFX12: v_rsq_f16_e32 v5, v127 ; encoding: [0x7f,0xad,0x0a,0x7e]
+
+v_rsq_f16 v5, s1
+// GFX12: v_rsq_f16_e32 v5, s1 ; encoding: [0x01,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, s105
+// GFX12: v_rsq_f16_e32 v5, s105 ; encoding: [0x69,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, vcc_lo
+// GFX12: v_rsq_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, vcc_hi
+// GFX12: v_rsq_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, ttmp15
+// GFX12: v_rsq_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, m0
+// GFX12: v_rsq_f16_e32 v5, m0 ; encoding: [0x7d,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, exec_lo
+// GFX12: v_rsq_f16_e32 v5, exec_lo ; encoding: [0x7e,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, exec_hi
+// GFX12: v_rsq_f16_e32 v5, exec_hi ; encoding: [0x7f,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, null
+// GFX12: v_rsq_f16_e32 v5, null ; encoding: [0x7c,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, -1
+// GFX12: v_rsq_f16_e32 v5, -1 ; encoding: [0xc1,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, 0.5
+// GFX12: v_rsq_f16_e32 v5, 0.5 ; encoding: [0xf0,0xac,0x0a,0x7e]
+
+v_rsq_f16 v5, src_scc
+// GFX12: v_rsq_f16_e32 v5, src_scc ; encoding: [0xfd,0xac,0x0a,0x7e]
+
+v_rsq_f16 v127, 0xfe0b
+// GFX12: v_rsq_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xac,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rsq_f32 v5, v1
+// GFX12: v_rsq_f32_e32 v5, v1 ; encoding: [0x01,0x5d,0x0a,0x7e]
+
+v_rsq_f32 v5, v255
+// GFX12: v_rsq_f32_e32 v5, v255 ; encoding: [0xff,0x5d,0x0a,0x7e]
+
+v_rsq_f32 v5, s1
+// GFX12: v_rsq_f32_e32 v5, s1 ; encoding: [0x01,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, s105
+// GFX12: v_rsq_f32_e32 v5, s105 ; encoding: [0x69,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, vcc_lo
+// GFX12: v_rsq_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, vcc_hi
+// GFX12: v_rsq_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, ttmp15
+// GFX12: v_rsq_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, m0
+// GFX12: v_rsq_f32_e32 v5, m0 ; encoding: [0x7d,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, exec_lo
+// GFX12: v_rsq_f32_e32 v5, exec_lo ; encoding: [0x7e,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, exec_hi
+// GFX12: v_rsq_f32_e32 v5, exec_hi ; encoding: [0x7f,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, null
+// GFX12: v_rsq_f32_e32 v5, null ; encoding: [0x7c,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, -1
+// GFX12: v_rsq_f32_e32 v5, -1 ; encoding: [0xc1,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, 0.5
+// GFX12: v_rsq_f32_e32 v5, 0.5 ; encoding: [0xf0,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v5, src_scc
+// GFX12: v_rsq_f32_e32 v5, src_scc ; encoding: [0xfd,0x5c,0x0a,0x7e]
+
+v_rsq_f32 v255, 0xaf123456
+// GFX12: v_rsq_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x5c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_rsq_f64 v[5:6], v[1:2]
+// GFX12: v_rsq_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x63,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], v[254:255]
+// GFX12: v_rsq_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x63,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], s[2:3]
+// GFX12: v_rsq_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], s[104:105]
+// GFX12: v_rsq_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], vcc
+// GFX12: v_rsq_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], ttmp[14:15]
+// GFX12: v_rsq_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], exec
+// GFX12: v_rsq_f64_e32 v[5:6], exec ; encoding: [0x7e,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], null
+// GFX12: v_rsq_f64_e32 v[5:6], null ; encoding: [0x7c,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], -1
+// GFX12: v_rsq_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], 0.5
+// GFX12: v_rsq_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[5:6], src_scc
+// GFX12: v_rsq_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x62,0x0a,0x7e]
+
+v_rsq_f64 v[254:255], 0xaf123456
+// GFX12: v_rsq_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x62,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_sat_pk_u8_i16 v5, v1
+// GFX12: v_sat_pk_u8_i16_e32 v5, v1 ; encoding: [0x01,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, v255
+// GFX12: v_sat_pk_u8_i16_e32 v5, v255 ; encoding: [0xff,0xc5,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, s1
+// GFX12: v_sat_pk_u8_i16_e32 v5, s1 ; encoding: [0x01,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, s105
+// GFX12: v_sat_pk_u8_i16_e32 v5, s105 ; encoding: [0x69,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, vcc_lo
+// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_lo ; encoding: [0x6a,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, vcc_hi
+// GFX12: v_sat_pk_u8_i16_e32 v5, vcc_hi ; encoding: [0x6b,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, ttmp15
+// GFX12: v_sat_pk_u8_i16_e32 v5, ttmp15 ; encoding: [0x7b,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, m0
+// GFX12: v_sat_pk_u8_i16_e32 v5, m0 ; encoding: [0x7d,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, exec_lo
+// GFX12: v_sat_pk_u8_i16_e32 v5, exec_lo ; encoding: [0x7e,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, exec_hi
+// GFX12: v_sat_pk_u8_i16_e32 v5, exec_hi ; encoding: [0x7f,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, null
+// GFX12: v_sat_pk_u8_i16_e32 v5, null ; encoding: [0x7c,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, -1
+// GFX12: v_sat_pk_u8_i16_e32 v5, -1 ; encoding: [0xc1,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, 0.5
+// GFX12: v_sat_pk_u8_i16_e32 v5, 0.5 ; encoding: [0xf0,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v5, src_scc
+// GFX12: v_sat_pk_u8_i16_e32 v5, src_scc ; encoding: [0xfd,0xc4,0x0a,0x7e]
+
+v_sat_pk_u8_i16 v127, 0xfe0b
+// GFX12: v_sat_pk_u8_i16_e32 v127, 0xfe0b ; encoding: [0xff,0xc4,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v5, v1
+// GFX12: v_sin_f16_e32 v5, v1 ; encoding: [0x01,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5, v127
+// GFX12: v_sin_f16_e32 v5, v127 ; encoding: [0x7f,0xc1,0x0a,0x7e]
+
+v_sin_f16 v5, s1
+// GFX12: v_sin_f16_e32 v5, s1 ; encoding: [0x01,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, s105
+// GFX12: v_sin_f16_e32 v5, s105 ; encoding: [0x69,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, vcc_lo
+// GFX12: v_sin_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, vcc_hi
+// GFX12: v_sin_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, ttmp15
+// GFX12: v_sin_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, m0
+// GFX12: v_sin_f16_e32 v5, m0 ; encoding: [0x7d,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, exec_lo
+// GFX12: v_sin_f16_e32 v5, exec_lo ; encoding: [0x7e,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, exec_hi
+// GFX12: v_sin_f16_e32 v5, exec_hi ; encoding: [0x7f,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, null
+// GFX12: v_sin_f16_e32 v5, null ; encoding: [0x7c,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, -1
+// GFX12: v_sin_f16_e32 v5, -1 ; encoding: [0xc1,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, 0.5
+// GFX12: v_sin_f16_e32 v5, 0.5 ; encoding: [0xf0,0xc0,0x0a,0x7e]
+
+v_sin_f16 v5, src_scc
+// GFX12: v_sin_f16_e32 v5, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e]
+
+v_sin_f16 v127, 0xfe0b
+// GFX12: v_sin_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f32 v5, v1
+// GFX12: v_sin_f32_e32 v5, v1 ; encoding: [0x01,0x6b,0x0a,0x7e]
+
+v_sin_f32 v5, v255
+// GFX12: v_sin_f32_e32 v5, v255 ; encoding: [0xff,0x6b,0x0a,0x7e]
+
+v_sin_f32 v5, s1
+// GFX12: v_sin_f32_e32 v5, s1 ; encoding: [0x01,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, s105
+// GFX12: v_sin_f32_e32 v5, s105 ; encoding: [0x69,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, vcc_lo
+// GFX12: v_sin_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, vcc_hi
+// GFX12: v_sin_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, ttmp15
+// GFX12: v_sin_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, m0
+// GFX12: v_sin_f32_e32 v5, m0 ; encoding: [0x7d,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, exec_lo
+// GFX12: v_sin_f32_e32 v5, exec_lo ; encoding: [0x7e,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, exec_hi
+// GFX12: v_sin_f32_e32 v5, exec_hi ; encoding: [0x7f,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, null
+// GFX12: v_sin_f32_e32 v5, null ; encoding: [0x7c,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, -1
+// GFX12: v_sin_f32_e32 v5, -1 ; encoding: [0xc1,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, 0.5
+// GFX12: v_sin_f32_e32 v5, 0.5 ; encoding: [0xf0,0x6a,0x0a,0x7e]
+
+v_sin_f32 v5, src_scc
+// GFX12: v_sin_f32_e32 v5, src_scc ; encoding: [0xfd,0x6a,0x0a,0x7e]
+
+v_sin_f32 v255, 0xaf123456
+// GFX12: v_sin_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x6a,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_sqrt_f16 v5, v1
+// GFX12: v_sqrt_f16_e32 v5, v1 ; encoding: [0x01,0xab,0x0a,0x7e]
+
+v_sqrt_f16 v5, v127
+// GFX12: v_sqrt_f16_e32 v5, v127 ; encoding: [0x7f,0xab,0x0a,0x7e]
+
+v_sqrt_f16 v5, s1
+// GFX12: v_sqrt_f16_e32 v5, s1 ; encoding: [0x01,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, s105
+// GFX12: v_sqrt_f16_e32 v5, s105 ; encoding: [0x69,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, vcc_lo
+// GFX12: v_sqrt_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, vcc_hi
+// GFX12: v_sqrt_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, ttmp15
+// GFX12: v_sqrt_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, m0
+// GFX12: v_sqrt_f16_e32 v5, m0 ; encoding: [0x7d,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, exec_lo
+// GFX12: v_sqrt_f16_e32 v5, exec_lo ; encoding: [0x7e,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, exec_hi
+// GFX12: v_sqrt_f16_e32 v5, exec_hi ; encoding: [0x7f,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, null
+// GFX12: v_sqrt_f16_e32 v5, null ; encoding: [0x7c,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, -1
+// GFX12: v_sqrt_f16_e32 v5, -1 ; encoding: [0xc1,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, 0.5
+// GFX12: v_sqrt_f16_e32 v5, 0.5 ; encoding: [0xf0,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v5, src_scc
+// GFX12: v_sqrt_f16_e32 v5, src_scc ; encoding: [0xfd,0xaa,0x0a,0x7e]
+
+v_sqrt_f16 v127, 0xfe0b
+// GFX12: v_sqrt_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xaa,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sqrt_f32 v5, v1
+// GFX12: v_sqrt_f32_e32 v5, v1 ; encoding: [0x01,0x67,0x0a,0x7e]
+
+v_sqrt_f32 v5, v255
+// GFX12: v_sqrt_f32_e32 v5, v255 ; encoding: [0xff,0x67,0x0a,0x7e]
+
+v_sqrt_f32 v5, s1
+// GFX12: v_sqrt_f32_e32 v5, s1 ; encoding: [0x01,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, s105
+// GFX12: v_sqrt_f32_e32 v5, s105 ; encoding: [0x69,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, vcc_lo
+// GFX12: v_sqrt_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, vcc_hi
+// GFX12: v_sqrt_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, ttmp15
+// GFX12: v_sqrt_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, m0
+// GFX12: v_sqrt_f32_e32 v5, m0 ; encoding: [0x7d,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, exec_lo
+// GFX12: v_sqrt_f32_e32 v5, exec_lo ; encoding: [0x7e,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, exec_hi
+// GFX12: v_sqrt_f32_e32 v5, exec_hi ; encoding: [0x7f,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, null
+// GFX12: v_sqrt_f32_e32 v5, null ; encoding: [0x7c,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, -1
+// GFX12: v_sqrt_f32_e32 v5, -1 ; encoding: [0xc1,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, 0.5
+// GFX12: v_sqrt_f32_e32 v5, 0.5 ; encoding: [0xf0,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v5, src_scc
+// GFX12: v_sqrt_f32_e32 v5, src_scc ; encoding: [0xfd,0x66,0x0a,0x7e]
+
+v_sqrt_f32 v255, 0xaf123456
+// GFX12: v_sqrt_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x66,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_sqrt_f64 v[5:6], v[1:2]
+// GFX12: v_sqrt_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x69,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], v[254:255]
+// GFX12: v_sqrt_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x69,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], s[2:3]
+// GFX12: v_sqrt_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], s[104:105]
+// GFX12: v_sqrt_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], vcc
+// GFX12: v_sqrt_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], ttmp[14:15]
+// GFX12: v_sqrt_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], exec
+// GFX12: v_sqrt_f64_e32 v[5:6], exec ; encoding: [0x7e,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], null
+// GFX12: v_sqrt_f64_e32 v[5:6], null ; encoding: [0x7c,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], -1
+// GFX12: v_sqrt_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], 0.5
+// GFX12: v_sqrt_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[5:6], src_scc
+// GFX12: v_sqrt_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x68,0x0a,0x7e]
+
+v_sqrt_f64 v[254:255], 0xaf123456
+// GFX12: v_sqrt_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x68,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_swap_b32 v5, v1
+// GFX12: v_swap_b32 v5, v1 ; encoding: [0x01,0xcb,0x0a,0x7e]
+
+v_swap_b32 v255, v255
+// GFX12: v_swap_b32 v255, v255 ; encoding: [0xff,0xcb,0xfe,0x7f]
+
+v_swaprel_b32 v5, v1
+// GFX12: v_swaprel_b32 v5, v1 ; encoding: [0x01,0xd1,0x0a,0x7e]
+
+v_swaprel_b32 v255, v255
+// GFX12: v_swaprel_b32 v255, v255 ; encoding: [0xff,0xd1,0xfe,0x7f]
+
+v_trunc_f16 v5, v1
+// GFX12: v_trunc_f16_e32 v5, v1 ; encoding: [0x01,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5, v127
+// GFX12: v_trunc_f16_e32 v5, v127 ; encoding: [0x7f,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v5, s1
+// GFX12: v_trunc_f16_e32 v5, s1 ; encoding: [0x01,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, s105
+// GFX12: v_trunc_f16_e32 v5, s105 ; encoding: [0x69,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, vcc_lo
+// GFX12: v_trunc_f16_e32 v5, vcc_lo ; encoding: [0x6a,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, vcc_hi
+// GFX12: v_trunc_f16_e32 v5, vcc_hi ; encoding: [0x6b,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, ttmp15
+// GFX12: v_trunc_f16_e32 v5, ttmp15 ; encoding: [0x7b,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, m0
+// GFX12: v_trunc_f16_e32 v5, m0 ; encoding: [0x7d,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, exec_lo
+// GFX12: v_trunc_f16_e32 v5, exec_lo ; encoding: [0x7e,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, exec_hi
+// GFX12: v_trunc_f16_e32 v5, exec_hi ; encoding: [0x7f,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, null
+// GFX12: v_trunc_f16_e32 v5, null ; encoding: [0x7c,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, -1
+// GFX12: v_trunc_f16_e32 v5, -1 ; encoding: [0xc1,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, 0.5
+// GFX12: v_trunc_f16_e32 v5, 0.5 ; encoding: [0xf0,0xba,0x0a,0x7e]
+
+v_trunc_f16 v5, src_scc
+// GFX12: v_trunc_f16_e32 v5, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e]
+
+v_trunc_f16 v127, 0xfe0b
+// GFX12: v_trunc_f16_e32 v127, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f32 v5, v1
+// GFX12: v_trunc_f32_e32 v5, v1 ; encoding: [0x01,0x43,0x0a,0x7e]
+
+v_trunc_f32 v5, v255
+// GFX12: v_trunc_f32_e32 v5, v255 ; encoding: [0xff,0x43,0x0a,0x7e]
+
+v_trunc_f32 v5, s1
+// GFX12: v_trunc_f32_e32 v5, s1 ; encoding: [0x01,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, s105
+// GFX12: v_trunc_f32_e32 v5, s105 ; encoding: [0x69,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, vcc_lo
+// GFX12: v_trunc_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, vcc_hi
+// GFX12: v_trunc_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, ttmp15
+// GFX12: v_trunc_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, m0
+// GFX12: v_trunc_f32_e32 v5, m0 ; encoding: [0x7d,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, exec_lo
+// GFX12: v_trunc_f32_e32 v5, exec_lo ; encoding: [0x7e,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, exec_hi
+// GFX12: v_trunc_f32_e32 v5, exec_hi ; encoding: [0x7f,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, null
+// GFX12: v_trunc_f32_e32 v5, null ; encoding: [0x7c,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, -1
+// GFX12: v_trunc_f32_e32 v5, -1 ; encoding: [0xc1,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, 0.5
+// GFX12: v_trunc_f32_e32 v5, 0.5 ; encoding: [0xf0,0x42,0x0a,0x7e]
+
+v_trunc_f32 v5, src_scc
+// GFX12: v_trunc_f32_e32 v5, src_scc ; encoding: [0xfd,0x42,0x0a,0x7e]
+
+v_trunc_f32 v255, 0xaf123456
+// GFX12: v_trunc_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x42,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+v_trunc_f64 v[5:6], v[1:2]
+// GFX12: v_trunc_f64_e32 v[5:6], v[1:2] ; encoding: [0x01,0x2f,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], v[254:255]
+// GFX12: v_trunc_f64_e32 v[5:6], v[254:255] ; encoding: [0xfe,0x2f,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], s[2:3]
+// GFX12: v_trunc_f64_e32 v[5:6], s[2:3] ; encoding: [0x02,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], s[104:105]
+// GFX12: v_trunc_f64_e32 v[5:6], s[104:105] ; encoding: [0x68,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], vcc
+// GFX12: v_trunc_f64_e32 v[5:6], vcc ; encoding: [0x6a,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], ttmp[14:15]
+// GFX12: v_trunc_f64_e32 v[5:6], ttmp[14:15] ; encoding: [0x7a,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], exec
+// GFX12: v_trunc_f64_e32 v[5:6], exec ; encoding: [0x7e,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], null
+// GFX12: v_trunc_f64_e32 v[5:6], null ; encoding: [0x7c,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], -1
+// GFX12: v_trunc_f64_e32 v[5:6], -1 ; encoding: [0xc1,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], 0.5
+// GFX12: v_trunc_f64_e32 v[5:6], 0.5 ; encoding: [0xf0,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[5:6], src_scc
+// GFX12: v_trunc_f64_e32 v[5:6], src_scc ; encoding: [0xfd,0x2e,0x0a,0x7e]
+
+v_trunc_f64 v[254:255], 0xaf123456
+// GFX12: v_trunc_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
index 2e83f8f..e82ccc1 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s
@@ -1,7 +1,9 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-ASM %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | %extract-encodings | llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -disassemble -show-encoding | FileCheck --strict-whitespace --check-prefixes=GFX12,GFX12-DIS %s
+
+// This file will be converted to the true16 format when more true16 instructions are supported.
v_bfrev_b32_e32 v5, v1
// GFX12: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16-fake16.s
new file mode 100644
index 0000000..9b181e9
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16-fake16.s
@@ -0,0 +1,2828 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_bfrev_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_bfrev_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_bfrev_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_bfrev_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_bfrev_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_ceil_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ceil_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ceil_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ceil_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ceil_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ceil_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_ceil_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ceil_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ceil_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ceil_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ceil_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ceil_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x44,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cls_i32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cls_i32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cls_i32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cls_i32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cls_i32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cls_i32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cls_i32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cls_i32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cls_i32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cls_i32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cls_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x76,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_clz_i32_u32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_clz_i32_u32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_clz_i32_u32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_clz_i32_u32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_clz_i32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_clz_i32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cos_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cos_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cos_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cos_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cos_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cos_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cos_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cos_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cos_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cos_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cos_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cos_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cos_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cos_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cos_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cos_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cos_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cos_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cos_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cos_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cos_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x6c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cos_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x6c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_ctz_i32_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ctz_i32_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ctz_i32_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ctz_i32_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ctz_i32_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc
+// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac]
+
+v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe
+// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e]
+
+v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc
+// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac]
+
+v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe
+// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e]
+
+v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f16_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f16_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f16_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f16_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f16_f32 v127, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x14,0xfe,0x7e,0xff,0x6f,0x35,0x30]
+
+v_cvt_f16_i16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f16_i16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f16_i16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f16_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f16_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f16_i16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_cvt_f16_u16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f16_u16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f16_u16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f16_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f16_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f16_u16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa0,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_cvt_f32_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x16,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_f16 v255, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x16,0xfe,0x7f,0x7f,0x6f,0x35,0x30]
+
+v_cvt_f32_i32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_i32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_i32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_i32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_i32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x0a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x0a,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_u32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_u32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_u32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_u32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x0c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x0c,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte0 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte0 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte0 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x22,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte0 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x22,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte1 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte1 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte1 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x24,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte1 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x24,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte2 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte2 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte2 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x26,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte2 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x26,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_f32_ubyte3 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_f32_ubyte3 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_f32_ubyte3 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x28,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_f32_ubyte3 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x28,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_floor_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_floor_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_floor_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_floor_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x1a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_flr_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_flr_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_flr_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x1a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_flr_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x1a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_i16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_i16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_i16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_i16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_i16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x10,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x10,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_i32_i16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_i32_i16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_i32_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_i32_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xd4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_i32_i16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xd4,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+
+v_cvt_nearest_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_nearest_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_nearest_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_nearest_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x18,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_norm_i16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_norm_i16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_norm_i16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_norm_i16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_norm_u16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_norm_u16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_norm_u16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_norm_u16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_off_f32_i4 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_off_f32_i4 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_off_f32_i4 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_off_f32_i4 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x1c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_off_f32_i4 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x1c,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_cvt_rpi_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_rpi_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_rpi_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x18,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_rpi_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x18,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_u16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_u16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_u16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_u16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_cvt_u32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_u32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_u32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x0e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_u32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x0e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_cvt_u32_u16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_cvt_u32_u16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_cvt_u32_u16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_cvt_u32_u16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xd6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_cvt_u32_u16 v255, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xd6,0xfe,0x7f,0x7f,0x6f,0x05,0x30]
+
+v_exp_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_exp_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_exp_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_exp_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_exp_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_exp_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_exp_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_exp_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_exp_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_exp_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_exp_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_exp_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_exp_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_exp_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_exp_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_exp_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_exp_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_exp_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_exp_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_exp_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_exp_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x4a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_exp_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_ffbh_i32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ffbh_i32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ffbh_i32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ffbh_i32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ffbh_i32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x76,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ffbh_i32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x76,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_ffbh_u32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ffbh_u32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ffbh_u32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ffbh_u32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ffbh_u32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x72,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ffbh_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x72,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_ffbl_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_ffbl_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_ffbl_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_ffbl_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_ffbl_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x74,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_ffbl_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_floor_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_floor_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_floor_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_floor_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_floor_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_floor_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_floor_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_floor_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_floor_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_floor_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_floor_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_floor_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_floor_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_floor_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_floor_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_floor_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_floor_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_floor_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_floor_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_floor_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_floor_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x48,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_fract_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_fract_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_fract_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_fract_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_fract_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_fract_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_fract_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_fract_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xbe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_fract_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xbe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_fract_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_fract_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_fract_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_fract_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_fract_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_fract_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_fract_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_fract_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_fract_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_fract_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x40,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_fract_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x40,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_frexp_exp_i16_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_exp_i16_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_exp_i16_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_exp_i16_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_frexp_exp_i32_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_exp_i32_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_exp_i32_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x7e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_exp_i32_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x7e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_frexp_mant_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_mant_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xb2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xb2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_frexp_mant_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_frexp_mant_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_frexp_mant_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_frexp_mant_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_frexp_mant_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x80,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_frexp_mant_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x80,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_log_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_log_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_log_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_log_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_log_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_log_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_log_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_log_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_log_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_log_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_log_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_log_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_log_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xae,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_log_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xae,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_log_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_log_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_log_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_log_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_log_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_log_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_log_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_log_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_log_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_log_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_log_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_log_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_log_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x4e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_log_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x4e,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_mov_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_mov_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_mov_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_mov_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_mov_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_mov_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_mov_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_mov_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_mov_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_mov_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_mov_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x02,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movreld_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movreld_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movreld_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movreld_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movreld_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x84,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movreld_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x84,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movrels_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movrels_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movrels_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movrels_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movrels_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x86,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movrels_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x86,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movrelsd_2_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movrelsd_2_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movrelsd_2_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movrelsd_2_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x90,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movrelsd_2_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x90,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_movrelsd_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_movrelsd_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_movrelsd_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_movrelsd_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_movrelsd_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x88,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_movrelsd_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x88,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_not_b16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_not_b16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_not_b16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_not_b16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_not_b16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_not_b16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_not_b16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_not_b16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_not_b16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_not_b16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_not_b16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_not_b16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_not_b16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xd2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_not_b16 v127, v127 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xd2,0xfe,0x7e,0x7f,0x6f,0x05,0x30]
+
+v_not_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_not_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_not_b32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_not_b32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_not_b32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_not_b32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_not_b32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_not_b32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_not_b32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_not_b32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_not_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_not_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_not_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x6e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_not_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x6e,0xfe,0x7f,0xff,0x6f,0x05,0x30]
+
+v_rcp_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rcp_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rcp_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rcp_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rcp_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xa8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rcp_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xa8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rcp_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rcp_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rcp_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rcp_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rcp_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x54,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rcp_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x54,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_rcp_iflag_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rcp_iflag_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rcp_iflag_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rcp_iflag_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x56,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rcp_iflag_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x56,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_rndne_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rndne_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rndne_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rndne_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xbc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rndne_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xbc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rndne_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rndne_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rndne_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rndne_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rndne_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x46,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rndne_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x46,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_rsq_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rsq_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rsq_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rsq_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rsq_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xac,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rsq_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xac,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_rsq_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_rsq_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_rsq_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_rsq_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_rsq_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x5c,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_rsq_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x5c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_sat_pk_u8_i16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sat_pk_u8_i16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sat_pk_u8_i16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sat_pk_u8_i16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sat_pk_u8_i16 v127, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc4,0xfe,0x7e,0xff,0x6f,0x05,0x30]
+
+v_sin_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sin_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sin_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sin_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sin_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sin_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sin_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sin_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sin_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sin_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xc0,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xc0,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sin_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sin_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sin_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sin_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sin_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sin_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sin_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sin_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sin_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sin_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x6a,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sin_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x6a,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_sqrt_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sqrt_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sqrt_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sqrt_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sqrt_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xaa,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sqrt_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xaa,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_sqrt_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_sqrt_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_sqrt_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_sqrt_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_sqrt_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x66,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_sqrt_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x66,0xfe,0x7f,0xff,0x6f,0x35,0x30]
+
+v_trunc_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f16 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_trunc_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_trunc_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0xba,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_trunc_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0xba,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+
+v_trunc_f32 v5, v1 quad_perm:[3,2,1,0]
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+v_trunc_f32 v5, v1 quad_perm:[0,1,2,3]
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+v_trunc_f32 v5, v1 row_mirror
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_half_mirror
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shl:1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shl:15
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shr:1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_shr:15
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_ror:1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_ror:15
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+v_trunc_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+v_trunc_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0xfa,0x42,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
+v_trunc_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: encoding: [0xfa,0x42,0xfe,0x7f,0xff,0x6f,0x35,0x30]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
index d5cafcd..323439b 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s
@@ -1,5 +1,7 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+// This file will be converted to the true16 format when more true16 instructions are supported.
v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX12: encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8-fake16.s
new file mode 100644
index 0000000..82acbd4
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8-fake16.s
@@ -0,0 +1,617 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_bfrev_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_bfrev_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ceil_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_ceil_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ceil_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x44,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cls_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cls_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cls_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x76,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_clz_i32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_clz_i32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x6c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cos_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x6c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa]
+
+v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05]
+
+v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7]
+// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa]
+
+v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05]
+
+v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_f32 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x14,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+v_cvt_f16_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_i16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_f16_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f16_u16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_f32_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x16,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x16,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_f16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x16,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+v_cvt_f32_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x0a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x0a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x0a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x0c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x0c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x0c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte0 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x22,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte0 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x22,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte0 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x22,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte1 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x24,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte1 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x24,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte1 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x24,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte2 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x26,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte2 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x26,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte2 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x26,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte3 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x28,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte3 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x28,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_f32_ubyte3 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x28,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_floor_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_floor_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_floor_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x1a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_flr_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_flr_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x1a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_flr_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x1a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x10,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x10,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x10,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xd4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_i32_i16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xd4,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_nearest_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_nearest_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x18,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_norm_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_i16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_norm_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_norm_u16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_off_f32_i4 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x1c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_off_f32_i4 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x1c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_off_f32_i4 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x1c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_rpi_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_rpi_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x18,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_rpi_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x18,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x0e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x0e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x0e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xd6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_cvt_u32_u16 v255, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xd6,0xfe,0x7f,0x7f,0x00,0x00,0x00]
+
+v_exp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_exp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x4a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_exp_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ffbh_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_i32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x76,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_i32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x76,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ffbh_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_u32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x72,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbh_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x72,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_ffbl_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbl_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x74,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_ffbl_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_floor_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_floor_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x48,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xbe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xbe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x40,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_fract_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x40,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i16_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i16_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x7e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i32_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x7e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_exp_i32_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x7e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xb2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xb2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x80,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_frexp_mant_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x80,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_log_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xae,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xae,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_log_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x4e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_log_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x4e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_mov_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x02,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_mov_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x02,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_mov_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x02,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movreld_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x84,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movreld_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x84,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movreld_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x84,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movrels_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x86,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrels_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x86,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrels_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x86,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movrelsd_2_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x90,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_2_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x90,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_2_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x90,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x88,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x88,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_movrelsd_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x88,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xd2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xd2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x6e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_not_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x6e,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rcp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xa8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xa8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rcp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x54,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x54,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x56,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_iflag_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x56,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rcp_iflag_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x56,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xbc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xbc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x46,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rndne_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x46,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_rsq_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xac,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xac,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x5c,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_rsq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x5c,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sat_pk_u8_i16 v127, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc4,0xfe,0x7e,0xff,0x00,0x00,0x00]
+
+v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xc0,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xc0,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x6a,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sin_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x6a,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_sqrt_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xaa,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xaa,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x66,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_sqrt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x66,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0xba,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0xba,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+
+v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: encoding: [0xe9,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: encoding: [0xea,0x42,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+v_trunc_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0xe9,0x42,0xfe,0x7f,0xff,0x00,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
index 4c88401..fa3234d8 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s
@@ -1,5 +1,7 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64,+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s
+
+// This file will be converted to the true16 format when more true16 instructions are supported.
v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err-fake16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err-fake16.s
new file mode 100644
index 0000000..d3aa94d
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err-fake16.s
@@ -0,0 +1,505 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
+
+v_ceil_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_ceil_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_ceil_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cos_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cos_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cos_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_f32_e32 v128, 0xaf123456
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_f32_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_f32_e32 v255, v255
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_i16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_i16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_i16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_u16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_u16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f16_u16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_f32_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_i32_i16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_i16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_i16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_u16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_u16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_norm_u16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_cvt_u32_u16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_exp_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_exp_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_floor_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_floor_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_floor_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_fract_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_fract_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_fract_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_exp_i16_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_exp_i16_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_exp_i16_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_mant_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_mant_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_frexp_mant_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_log_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_log_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_log_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_not_b16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rcp_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rcp_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rndne_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rndne_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rndne_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rsq_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rsq_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_rsq_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sat_pk_u8_i16_e32 v199, v5
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sin_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sin_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sin_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sqrt_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sqrt_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_sqrt_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_trunc_f16_e32 v128, 0xfe0b
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_trunc_f16_e32 v255, v1
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_trunc_f16_e32 v5, v199
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_ceil_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_ceil_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v128, 0xaf123456 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v255 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f32_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199, v5 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255, v1 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5, v199 quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_ceil_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_ceil_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cos_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v128, 0xaf123456 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_f32_e32 v255, v255 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f16_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_f32_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_i32_i16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_i16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_norm_u16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cvt_u32_u16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_exp_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_floor_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_fract_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_exp_i16_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_frexp_mant_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_log_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_not_b16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rcp_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rndne_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_rsq_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sat_pk_u8_i16_e32 v199, v5 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sin_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_sqrt_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v255, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_trunc_f16_e32 v5, v199 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
index 37edf62..46a865b 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_t16_err.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12 --implicit-check-not=error %s
v_ceil_f16_e32 v128, 0xfe0b
// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
diff --git a/llvm/test/MC/ARM/Windows/invalid-relocation.s b/llvm/test/MC/ARM/Windows/invalid-relocation.s
index db41570..821de8d 100644
--- a/llvm/test/MC/ARM/Windows/invalid-relocation.s
+++ b/llvm/test/MC/ARM/Windows/invalid-relocation.s
@@ -1,4 +1,4 @@
-# RUN: not --crash llvm-mc -triple thumbv7-windows -incremental-linker-compatible -filetype obj -o /dev/null 2>&1 %s \
+# RUN: not llvm-mc -triple thumbv7-windows -incremental-linker-compatible -filetype obj -o /dev/null 2>&1 %s \
# RUN: | FileCheck %s
.def invalid_relocation
@@ -9,5 +9,4 @@
.thumb_func
adr r0, invalid_relocation+1
-# CHECK: LLVM ERROR: unsupported relocation type: fixup_t2_adr_pcrel_12
-
+# CHECK: 10:2: error: unsupported relocation type
diff --git a/llvm/test/MC/ELF/layout-interdependency2.s b/llvm/test/MC/ELF/layout-interdependency2.s
new file mode 100644
index 0000000..24fafe5
--- /dev/null
+++ b/llvm/test/MC/ELF/layout-interdependency2.s
@@ -0,0 +1,84 @@
+## Contrived .zero directive example, simplified from the Linux kernel use case,
+## which requires multiple iterations to converge.
+## https://github.com/llvm/llvm-project/issues/100283
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+# CHECK: 80: jne 0x0 <.text>
+# CHECK-EMPTY:
+
+ .text
+.Ltmp0:
+.Ltmp1:
+ cli
+ popq %rdi
+text1:
+ .zero (.Ltmp2-.Ltmp3)
+ .section "","ax",@progbits
+.Ltmp3:
+ movq $0, %rax
+.Ltmp4:
+.Ltmp5:
+ .section .discard.intra_function_calls,"ax",@progbits
+ .long .Ltmp5
+ .section "","ax",@progbits
+ callq .Ltmp6
+ int3
+.Ltmp7:
+.Ltmp8:
+ .section .discard.intra_function_calls,"ax",@progbits
+ .long .Ltmp8
+ .section "","ax",@progbits
+ callq .Ltmp6
+ int3
+.Ltmp6:
+ addq $0, %rsp
+ decq %rax
+ jne .Ltmp4
+ lfence
+ movq $-1, %gs:pcpu_hot+6
+
+.Ltmp2:
+ .text
+text2:
+
+ .zero (.Ltmp9-.Ltmp10)
+ .section "","ax",@progbits
+.Ltmp10:
+ jmp .Ltmp11
+.Ltmp9:
+ .text
+text3:
+
+.Ltmp12:
+ .zero (.Ltmp13-.Ltmp14)
+ .section "","ax",@progbits
+.Ltmp14:
+ callq entry_untrain_ret
+.Ltmp13:
+ .text
+
+ .zero (.Ltmp15-.Ltmp16)
+ .section "","ax",@progbits
+.Ltmp16:
+ xorl %eax, %eax
+ btsq $63, %rax
+ movq %rax, %gs:pcpu_hot+6
+
+.Ltmp15:
+ .text
+
+ popq %r12
+ popq rbp
+ jmp __x86_return_thunk
+ movl 936(%rdi), %eax
+ cmpl %gs:x86_spec_ctrl_current, %eax
+ je .Ltmp0
+ movl edx, %edx
+ wrmsr
+ jmp .Ltmp0
+.Ltmp11:
+ movl $72, %ecx
+ jmp .Ltmp12
+ cmpb $0, kvm_rebooting
+ jne .Ltmp1
diff --git a/llvm/test/MC/X86/apx/ccmp-att.s b/llvm/test/MC/X86/apx/ccmp-att.s
index 405071b..32e6f37 100644
--- a/llvm/test/MC/X86/apx/ccmp-att.s
+++ b/llvm/test/MC/X86/apx/ccmp-att.s
@@ -1,7 +1,7 @@
# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
-# ERROR-COUNT-402: error:
+# ERROR-COUNT-428: error:
# ERROR-NOT: error:
## Condition flags
@@ -1217,3 +1217,84 @@
# CHECK: ccmpoq {dfv=of,sf,zf,cf} %rax, %rbx
# CHECK: encoding: [0x62,0xf4,0xfc,0x00,0x39,0xc3]
ccmpoq {dFV=Cf,zF,SF,of} %rax, %rbx
+
+## "{evex} cmp*" are alias for "ccmpt* {dfv=}"
+
+# CHECK: ccmptb {dfv=} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x80,0x7c,0x80,0x7b,0x7b]
+ {evex} cmpb $123, 123(%r8,%rax,4)
+# CHECK: ccmptw {dfv=} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+ {evex} cmpw $123, 123(%r8,%rax,4)
+# CHECK: ccmptw {dfv=} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x81,0x7c,0x80,0x7b,0xd2,0x04]
+ {evex} cmpw $1234, 123(%r8,%rax,4)
+# CHECK: ccmptl {dfv=} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+ {evex} cmpl $123, 123(%r8,%rax,4)
+# CHECK: ccmptl {dfv=} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} cmpl $123456, 123(%r8,%rax,4)
+# CHECK: ccmptq {dfv=} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+ {evex} cmpq $123, 123(%r8,%rax,4)
+# CHECK: ccmptq {dfv=} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} cmpq $123456, 123(%r8,%rax,4)
+# CHECK: ccmptb {dfv=} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x38,0x5c,0x80,0x7b]
+ {evex} cmpb %bl, 123(%r8,%rax,4)
+# CHECK: ccmptw {dfv=} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x39,0x54,0x80,0x7b]
+ {evex} cmpw %dx, 123(%r8,%rax,4)
+# CHECK: ccmptl {dfv=} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x39,0x4c,0x80,0x7b]
+ {evex} cmpl %ecx, 123(%r8,%rax,4)
+# CHECK: ccmptq {dfv=} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0x4c,0x80,0x7b]
+ {evex} cmpq %r9, 123(%r8,%rax,4)
+# CHECK: ccmptb {dfv=} 123(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3a,0x5c,0x80,0x7b]
+ {evex} cmpb 123(%r8,%rax,4), %bl
+# CHECK: ccmptw {dfv=} 123(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x3b,0x54,0x80,0x7b]
+ {evex} cmpw 123(%r8,%rax,4), %dx
+# CHECK: ccmptl {dfv=} 123(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3b,0x4c,0x80,0x7b]
+ {evex} cmpl 123(%r8,%rax,4), %ecx
+# CHECK: ccmptq {dfv=} 123(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x3b,0x4c,0x80,0x7b]
+ {evex} cmpq 123(%r8,%rax,4), %r9
+# CHECK: ccmptb {dfv=} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x80,0xfb,0x7b]
+ {evex} cmpb $123, %bl
+# CHECK: ccmptw {dfv=} $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x83,0xfa,0x7b]
+ {evex} cmpw $123, %dx
+# CHECK: ccmptl {dfv=} $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x83,0xf9,0x7b]
+ {evex} cmpl $123, %ecx
+# CHECK: ccmptq {dfv=} $123, %r9
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0xf9,0x7b]
+ {evex} cmpq $123, %r9
+# CHECK: ccmptw {dfv=} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x81,0xfa,0xd2,0x04]
+ {evex} cmpw $1234, %dx
+# CHECK: ccmptl {dfv=} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+ {evex} cmpl $123456, %ecx
+# CHECK: ccmptq {dfv=} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+ {evex} cmpq $123456, %r9
+# CHECK: ccmptb {dfv=} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x38,0xda]
+ {evex} cmpb %bl, %dl
+# CHECK: ccmptw {dfv=} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x39,0xd0]
+ {evex} cmpw %dx, %ax
+# CHECK: ccmptl {dfv=} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x39,0xca]
+ {evex} cmpl %ecx, %edx
+# CHECK: ccmptq {dfv=} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0xcf]
+ {evex} cmpq %r9, %r15
diff --git a/llvm/test/MC/X86/apx/ccmp-intel.s b/llvm/test/MC/X86/apx/ccmp-intel.s
index 2d446b0..5e64b63 100644
--- a/llvm/test/MC/X86/apx/ccmp-intel.s
+++ b/llvm/test/MC/X86/apx/ccmp-intel.s
@@ -1214,3 +1214,84 @@
# CHECK: ccmpo {dfv=of,sf,zf,cf} rbx, rax
# CHECK: encoding: [0x62,0xf4,0xfc,0x00,0x39,0xc3]
ccmpo {DFv=Cf,zF,SF,of} rbx, rax
+
+## "{evex} cmp*" are alias for "ccmpt* {dfv=}"
+
+# CHECK: ccmpt {dfv=} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x80,0x7c,0x80,0x7b,0x7b]
+ {evex} cmp byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt {dfv=} word ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+ {evex} cmp word ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt {dfv=} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x81,0x7c,0x80,0x7b,0xd2,0x04]
+ {evex} cmp word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ccmpt {dfv=} dword ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+ {evex} cmp dword ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt {dfv=} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} cmp dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ccmpt {dfv=} qword ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0x7c,0x80,0x7b,0x7b]
+ {evex} cmp qword ptr [r8 + 4*rax + 123], 123
+# CHECK: ccmpt {dfv=} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0x7c,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} cmp qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ccmpt {dfv=} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x38,0x5c,0x80,0x7b]
+ {evex} cmp byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ccmpt {dfv=} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x39,0x54,0x80,0x7b]
+ {evex} cmp word ptr [r8 + 4*rax + 123], dx
+# CHECK: ccmpt {dfv=} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x39,0x4c,0x80,0x7b]
+ {evex} cmp dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ccmpt {dfv=} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0x4c,0x80,0x7b]
+ {evex} cmp qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ccmpt {dfv=} bl, byte ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3a,0x5c,0x80,0x7b]
+ {evex} cmp bl, byte ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt {dfv=} dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x3b,0x54,0x80,0x7b]
+ {evex} cmp dx, word ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt {dfv=} ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x3b,0x4c,0x80,0x7b]
+ {evex} cmp ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt {dfv=} r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x3b,0x4c,0x80,0x7b]
+ {evex} cmp r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: ccmpt {dfv=} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x80,0xfb,0x7b]
+ {evex} cmp bl, 123
+# CHECK: ccmpt {dfv=} dx, 123
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x83,0xfa,0x7b]
+ {evex} cmp dx, 123
+# CHECK: ccmpt {dfv=} ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x83,0xf9,0x7b]
+ {evex} cmp ecx, 123
+# CHECK: ccmpt {dfv=} r9, 123
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x83,0xf9,0x7b]
+ {evex} cmp r9, 123
+# CHECK: ccmpt {dfv=} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x81,0xfa,0xd2,0x04]
+ {evex} cmp dx, 1234
+# CHECK: ccmpt {dfv=} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+ {evex} cmp ecx, 123456
+# CHECK: ccmpt {dfv=} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0x81,0xf9,0x40,0xe2,0x01,0x00]
+ {evex} cmp r9, 123456
+# CHECK: ccmpt {dfv=} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x38,0xda]
+ {evex} cmp dl, bl
+# CHECK: ccmpt {dfv=} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x39,0xd0]
+ {evex} cmp ax, dx
+# CHECK: ccmpt {dfv=} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x39,0xca]
+ {evex} cmp edx, ecx
+# CHECK: ccmpt {dfv=} r15, r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0xcf]
+ {evex} cmp r15, r9
diff --git a/llvm/test/MC/X86/apx/ctest-att.s b/llvm/test/MC/X86/apx/ctest-att.s
index 809ffc4..4cb9287 100644
--- a/llvm/test/MC/X86/apx/ctest-att.s
+++ b/llvm/test/MC/X86/apx/ctest-att.s
@@ -1,7 +1,7 @@
# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
-# ERROR-COUNT-260: error:
+# ERROR-COUNT-276: error:
# ERROR-NOT: error:
# CHECK: ctestbb {dfv=of} $123, 123(%r8,%rax,4)
# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
@@ -784,3 +784,54 @@
# CHECK: ctesteq {dfv=of} %r9, %r15
# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
ctesteq {dfv=of} %r9, %r15
+
+## "{evex} test*" are alias for "ctestt* {dfv=}"
+
+# CHECK: ctesttb {dfv=} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+ {evex} testb $123, 123(%r8,%rax,4)
+# CHECK: ctesttw {dfv=} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+ {evex} testw $1234, 123(%r8,%rax,4)
+# CHECK: ctesttl {dfv=} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} testl $123456, 123(%r8,%rax,4)
+# CHECK: ctesttq {dfv=} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} testq $123456, 123(%r8,%rax,4)
+# CHECK: ctesttb {dfv=} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x84,0x5c,0x80,0x7b]
+ {evex} testb %bl, 123(%r8,%rax,4)
+# CHECK: ctesttw {dfv=} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x85,0x54,0x80,0x7b]
+ {evex} testw %dx, 123(%r8,%rax,4)
+# CHECK: ctesttl {dfv=} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x85,0x4c,0x80,0x7b]
+ {evex} testl %ecx, 123(%r8,%rax,4)
+# CHECK: ctesttq {dfv=} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0x4c,0x80,0x7b]
+ {evex} testq %r9, 123(%r8,%rax,4)
+# CHECK: ctesttb {dfv=} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf6,0xc3,0x7b]
+ {evex} testb $123, %bl
+# CHECK: ctesttw {dfv=} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0xf7,0xc2,0xd2,0x04]
+ {evex} testw $1234, %dx
+# CHECK: ctesttl {dfv=} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} testl $123456, %ecx
+# CHECK: ctesttq {dfv=} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} testq $123456, %r9
+# CHECK: ctesttb {dfv=} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x84,0xda]
+ {evex} testb %bl, %dl
+# CHECK: ctesttw {dfv=} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x85,0xd0]
+ {evex} testw %dx, %ax
+# CHECK: ctesttl {dfv=} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x85,0xca]
+ {evex} testl %ecx, %edx
+# CHECK: ctesttq {dfv=} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0xcf]
+ {evex} testq %r9, %r15
diff --git a/llvm/test/MC/X86/apx/ctest-intel.s b/llvm/test/MC/X86/apx/ctest-intel.s
index b984163..701c517 100644
--- a/llvm/test/MC/X86/apx/ctest-intel.s
+++ b/llvm/test/MC/X86/apx/ctest-intel.s
@@ -780,3 +780,54 @@
# CHECK: cteste {dfv=of} r15, r9
# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
cteste {dfv=of} r15, r9
+
+## "{evex} test*" are alias for "ctestt* {dfv=}"
+
+# CHECK: ctestt {dfv=} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+ {evex} test byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestt {dfv=} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+ {evex} test word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestt {dfv=} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} test dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt {dfv=} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+ {evex} test qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt {dfv=} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x84,0x5c,0x80,0x7b]
+ {evex} test byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestt {dfv=} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x05,0x0a,0x85,0x54,0x80,0x7b]
+ {evex} test word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestt {dfv=} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x04,0x0a,0x85,0x4c,0x80,0x7b]
+ {evex} test dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestt {dfv=} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0x4c,0x80,0x7b]
+ {evex} test qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestt {dfv=} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf6,0xc3,0x7b]
+ {evex} test bl, 123
+# CHECK: ctestt {dfv=} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0xf7,0xc2,0xd2,0x04]
+ {evex} test dx, 1234
+# CHECK: ctestt {dfv=} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} test ecx, 123456
+# CHECK: ctestt {dfv=} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0x84,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+ {evex} test r9, 123456
+# CHECK: ctestt {dfv=} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x84,0xda]
+ {evex} test dl, bl
+# CHECK: ctestt {dfv=} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x05,0x0a,0x85,0xd0]
+ {evex} test ax, dx
+# CHECK: ctestt {dfv=} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x04,0x0a,0x85,0xca]
+ {evex} test edx, ecx
+# CHECK: ctestt {dfv=} r15, r9
+# CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0xcf]
+ {evex} test r15, r9
diff --git a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
index 990e281..2e9da9f 100644
--- a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
+++ b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
@@ -11,7 +11,7 @@ define void @f_Ym(i64 %m.coerce) {
; CHECK: ## InlineAsm End
entry:
- %0 = tail call x86_mmx asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"()
+ %0 = tail call <1 x i64> asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"()
ret void
}
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 42ef49f..ab04f80 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -184,12 +184,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-EXT: Running pass: {{.*}}::Bye
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index e74f88c..cb49cbd 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -183,12 +183,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis on bar
; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 0bb2633..96e8349 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -148,12 +148,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo
diff --git a/llvm/test/TableGen/dag-isel-regclass-emit-enum.td b/llvm/test/TableGen/dag-isel-regclass-emit-enum.td
index ba60df2..c41a19e 100644
--- a/llvm/test/TableGen/dag-isel-regclass-emit-enum.td
+++ b/llvm/test/TableGen/dag-isel-regclass-emit-enum.td
@@ -27,13 +27,13 @@ def GPRAbove127 : RegisterClass<"TestTarget", [i32], 32,
// CHECK-NEXT: OPC_CheckChild1Integer, 0,
// CHECK-NEXT: OPC_EmitInteger32, 0|128,2/*256*/,
// CHECK-NEXT: OPC_MorphNodeTo1None, TARGET_VAL(TargetOpcode::COPY_TO_REGCLASS),
-// CHECK-NEXT: MVT::i32, 2/*#Ops*/, 1, 0,
+// CHECK-NEXT: /*MVT::i32*/7, 2/*#Ops*/, 1, 0,
def : Pat<(i32 (add i32:$src, (i32 0))),
(COPY_TO_REGCLASS GPRAbove127, GPR0:$src)>;
// CHECK: OPC_CheckChild1Integer, 2,
// CHECK-NEXT: OPC_EmitStringInteger32, TestNamespace::GPR127RegClassID,
// CHECK-NEXT: OPC_MorphNodeTo1None, TARGET_VAL(TargetOpcode::COPY_TO_REGCLASS),
-// CHECK-NEXT: MVT::i32, 2/*#Ops*/, 1, 0,
+// CHECK-NEXT: /*MVT::i32*/7, 2/*#Ops*/, 1, 0,
def : Pat<(i32 (add i32:$src, (i32 1))),
(COPY_TO_REGCLASS GPR127, GPR0:$src)>;
diff --git a/llvm/test/ThinLTO/X86/ctxprof.ll b/llvm/test/ThinLTO/X86/ctxprof.ll
new file mode 100644
index 0000000..4c86ec9
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/ctxprof.ll
@@ -0,0 +1,73 @@
+; Test workload based importing via -thinlto-pgo-ctx-prof
+; Use external linkage symbols so we don't depend on module paths which are
+; used when computing the GUIDs of internal linkage symbols.
+; The functionality is shared with what workload.ll tests, so here we only care
+; about testing that the ctx profile is loaded and handled correctly.
+;
+; Set up
+; RUN: rm -rf %t
+; RUN: mkdir -p %t
+; RUN: split-file %s %t
+;
+; RUN: opt -module-summary %t/m1.ll -o %t/m1.bc
+; RUN: opt -module-summary %t/m2.ll -o %t/m2.bc
+; RUN: llvm-dis %t/m1.bc -o - | FileCheck %s --check-prefix=GUIDS-1
+; RUN: llvm-dis %t/m2.bc -o - | FileCheck %s --check-prefix=GUIDS-2
+;
+; GUIDS-1: name: "m1_f1"
+; GUIDS-1-SAME: guid = 6019442868614718803
+; GUIDS-2: name: "m2_f1"
+; GUIDS-2-SAME: guid = 15593096274670919754
+;
+; RUN: rm -rf %t_baseline
+; RUN: rm -rf %t_exp
+; RUN: mkdir -p %t_baseline
+; RUN: mkdir -p %t_exp
+;
+; Normal run. m1 shouldn't get m2_f1 because it's not referenced from there, and
+; m1_f1 shouldn't go to m2.
+;
+; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc \
+; RUN: -o %t_baseline/result.o -save-temps \
+; RUN: -r %t/m1.bc,m1_f1,plx \
+; RUN: -r %t/m2.bc,m2_f1,plx
+; RUN: llvm-dis %t_baseline/result.o.1.3.import.bc -o - | FileCheck %s --check-prefix=NOPROF-1
+; RUN: llvm-dis %t_baseline/result.o.2.3.import.bc -o - | FileCheck %s --check-prefix=NOPROF-2
+;
+; NOPROF-1-NOT: m2_f1()
+; NOPROF-2-NOT: m1_f1()
+;
+; The run with workload definitions - otherwise the same options.
+;
+; RUN: echo '[ \
+; RUN: {"Guid": 6019442868614718803, "Counters": [1], "Callsites": [[{"Guid": 15593096274670919754, "Counters": [1]}]]}, \
+; RUN: {"Guid": 15593096274670919754, "Counters": [1], "Callsites": [[{"Guid": 6019442868614718803, "Counters": [1]}]]} \
+; RUN: ]' > %t_exp/ctxprof.json
+; RUN: llvm-ctxprof-util fromJSON --input %t_exp/ctxprof.json --output %t_exp/ctxprof.bitstream
+; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc \
+; RUN: -o %t_exp/result.o -save-temps \
+; RUN: -thinlto-pgo-ctx-prof=%t_exp/ctxprof.bitstream \
+; RUN: -r %t/m1.bc,m1_f1,plx \
+; RUN: -r %t/m2.bc,m2_f1,plx
+; RUN: llvm-dis %t_exp/result.o.1.3.import.bc -o - | FileCheck %s --check-prefix=FIRST
+; RUN: llvm-dis %t_exp/result.o.2.3.import.bc -o - | FileCheck %s --check-prefix=SECOND
+;
+;
+; FIRST: m2_f1()
+; SECOND: m1_f1()
+;
+;--- m1.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+define dso_local void @m1_f1() {
+ ret void
+}
+
+;--- m2.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+define dso_local void @m2_f1() {
+ ret void
+}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll
index 5fea6f6..2518a30 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/fptosisat.ll
@@ -92,16 +92,10 @@ define i32 @f64_i16(double %in) {
}
define i64 @f16_i32(half %in) {
-; CHECK-FP-LABEL: @f16_i32(
-; CHECK-FP-NEXT: [[CONV:%.*]] = fptosi half [[IN:%.*]] to i64
-; CHECK-FP-NEXT: [[MIN:%.*]] = call i64 @llvm.smin.i64(i64 [[CONV]], i64 2147483647)
-; CHECK-FP-NEXT: [[MAX:%.*]] = call i64 @llvm.smax.i64(i64 [[MIN]], i64 -2147483648)
-; CHECK-FP-NEXT: ret i64 [[MAX]]
-;
-; CHECK-FP16-LABEL: @f16_i32(
-; CHECK-FP16-NEXT: [[TMP1:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half [[IN:%.*]])
-; CHECK-FP16-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
-; CHECK-FP16-NEXT: ret i64 [[TMP2]]
+; CHECK-LABEL: @f16_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.fptosi.sat.i32.f16(half [[IN:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT: ret i64 [[TMP2]]
;
%conv = fptosi half %in to i64
%min = call i64 @llvm.smin.i64(i64 %conv, i64 2147483647)
@@ -185,16 +179,10 @@ define <8 x i64> @v8f32_i32(<8 x float> %in) {
}
define <4 x i32> @v4f16_i16(<4 x half> %in) {
-; CHECK-FP-LABEL: @v4f16_i16(
-; CHECK-FP-NEXT: [[CONV:%.*]] = fptosi <4 x half> [[IN:%.*]] to <4 x i32>
-; CHECK-FP-NEXT: [[MIN:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[CONV]], <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
-; CHECK-FP-NEXT: [[MAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[MIN]], <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
-; CHECK-FP-NEXT: ret <4 x i32> [[MAX]]
-;
-; CHECK-FP16-LABEL: @v4f16_i16(
-; CHECK-FP16-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> [[IN:%.*]])
-; CHECK-FP16-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-FP16-NEXT: ret <4 x i32> [[TMP2]]
+; CHECK-LABEL: @v4f16_i16(
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> [[IN:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[TMP2]]
;
%conv = fptosi <4 x half> %in to <4 x i32>
%min = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
@@ -203,16 +191,10 @@ define <4 x i32> @v4f16_i16(<4 x half> %in) {
}
define <8 x i32> @v8f16_i16(<8 x half> %in) {
-; CHECK-FP-LABEL: @v8f16_i16(
-; CHECK-FP-NEXT: [[CONV:%.*]] = fptosi <8 x half> [[IN:%.*]] to <8 x i32>
-; CHECK-FP-NEXT: [[MIN:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[CONV]], <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
-; CHECK-FP-NEXT: [[MAX:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[MIN]], <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
-; CHECK-FP-NEXT: ret <8 x i32> [[MAX]]
-;
-; CHECK-FP16-LABEL: @v8f16_i16(
-; CHECK-FP16-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> [[IN:%.*]])
-; CHECK-FP16-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[TMP1]] to <8 x i32>
-; CHECK-FP16-NEXT: ret <8 x i32> [[TMP2]]
+; CHECK-LABEL: @v8f16_i16(
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> [[IN:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: ret <8 x i32> [[TMP2]]
;
%conv = fptosi <8 x half> %in to <8 x i32>
%min = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %conv, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
@@ -292,3 +274,6 @@ declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-FP: {{.*}}
+; CHECK-FP16: {{.*}}
diff --git a/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll b/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll
deleted file mode 100644
index e352900..0000000
--- a/llvm/test/Transforms/GlobalOpt/x86_mmx_load.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=globalopt < %s | FileCheck %s
-
-@m64 = internal global <1 x i64> zeroinitializer
-
-define i32 @load_mmx() {
-; CHECK-LABEL: @load_mmx(
-; CHECK-NEXT: ret i32 0
-;
- %temp = load x86_mmx, ptr @m64
- ret i32 0
-}
diff --git a/llvm/test/Transforms/IRCE/wide_indvar.ll b/llvm/test/Transforms/IRCE/wide_indvar.ll
index ecb13ad..b9be8ae 100644
--- a/llvm/test/Transforms/IRCE/wide_indvar.ll
+++ b/llvm/test/Transforms/IRCE/wide_indvar.ll
@@ -94,7 +94,7 @@ define i32 @test_increasing_slt_slt_wide_simple_postloop() {
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], 100
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP0:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP0:![0-9]+]], !loop_constrainer.loop.clone [[META5:![0-9]+]]
;
entry:
@@ -175,7 +175,7 @@ define i32 @test_increasing_slt_slt_wide_non-negative(ptr %n_ptr, ptr %m_ptr) {
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
@@ -268,7 +268,7 @@ define i32 @test_increasing_slt_slt_wide_general(ptr %n_ptr, ptr %m_ptr) {
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
@@ -367,7 +367,7 @@ define i32 @test_increasing_slt_slt_wide_general_preloop(ptr %n_ptr, ptr %m_ptr)
; CHECK-NEXT: [[LATCH_COND_PRELOOP:%.*]] = icmp slt i32 [[NARROW_IV_PRELOOP]], [[N]]
; CHECK-NEXT: [[WIDE_NARROW_IV_PRELOOP:%.*]] = sext i32 [[NARROW_IV_PRELOOP]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = icmp slt i64 [[WIDE_NARROW_IV_PRELOOP]], [[EXIT_PRELOOP_AT]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[LOOP_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR]], !llvm.loop [[LOOP10:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[TMP9]], label [[LOOP_PRELOOP]], label [[PRELOOP_EXIT_SELECTOR]], !llvm.loop [[LOOP10:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
; CHECK: preloop.exit.selector:
; CHECK-NEXT: [[IV_NEXT_PRELOOP_LCSSA:%.*]] = phi i64 [ [[IV_NEXT_PRELOOP]], [[BACKEDGE_PRELOOP]] ]
; CHECK-NEXT: [[NARROW_IV_PRELOOP_LCSSA]] = phi i32 [ [[NARROW_IV_PRELOOP]], [[BACKEDGE_PRELOOP]] ]
@@ -389,7 +389,7 @@ define i32 @test_increasing_slt_slt_wide_general_preloop(ptr %n_ptr, ptr %m_ptr)
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
@@ -519,7 +519,7 @@ define i32 @test_increasing_slt_slt_wide_multiple_checks(ptr %n_ptr, ptr %m1_ptr
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp slt i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP12:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP12:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
@@ -688,7 +688,7 @@ define i32 @test_increasing_ult_ult_wide_simple_postloop() {
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], 100
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP13:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
@@ -769,7 +769,7 @@ define i32 @test_increasing_ult_ult_wide_non-negative(ptr %n_ptr, ptr %m_ptr) {
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP14:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP14:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
@@ -859,7 +859,7 @@ define i32 @test_increasing_ult_ult_wide_general(ptr %n_ptr, ptr %m_ptr) {
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
@@ -980,7 +980,7 @@ define i32 @test_increasing_ult_ult_wide_multiple_checks(ptr %n_ptr, ptr %m1_ptr
; CHECK-NEXT: [[IV_NEXT_POSTLOOP]] = add i64 [[IV_POSTLOOP]], 1
; CHECK-NEXT: [[NARROW_IV_POSTLOOP]] = trunc i64 [[IV_NEXT_POSTLOOP]] to i32
; CHECK-NEXT: [[LATCH_COND_POSTLOOP:%.*]] = icmp ult i32 [[NARROW_IV_POSTLOOP]], [[N]]
-; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP16:![0-9]+]], !loop_constrainer.loop.clone !5
+; CHECK-NEXT: br i1 [[LATCH_COND_POSTLOOP]], label [[LOOP_POSTLOOP]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP16:![0-9]+]], !loop_constrainer.loop.clone [[META5]]
;
entry:
diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
index 79962b4..3b914dc2 100644
--- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -717,6 +717,15 @@ declare float @modff(float, ptr)
; CHECK: declare x86_fp80 @modfl(x86_fp80, ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
declare x86_fp80 @modfl(x86_fp80, ptr)
+; CHECK: declare double @nan(ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]]
+declare double @nan(ptr)
+
+; CHECK: declare float @nanf(ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
+declare float @nanf(ptr)
+
+; CHECK: declare x86_fp80 @nanl(ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
+declare x86_fp80 @nanl(ptr)
+
; CHECK: declare double @nearbyint(double) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]]
declare double @nearbyint(double)
@@ -956,7 +965,7 @@ declare ptr @strncpy(ptr, ptr, i64)
; CHECK: declare noalias ptr @strndup(ptr nocapture readonly, i64 noundef) [[INACCESSIBLEMEMORARGONLY_NOFREE_NOUNWIND_WILLRETURN_FAMILY_MALLOC]]
declare ptr @strndup(ptr, i64)
-; CHECK: declare i64 @strnlen(ptr nocapture, i64) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]]
+; CHECK: declare i64 @strnlen(ptr nocapture, i64) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
declare i64 @strnlen(ptr, i64)
; CHECK: declare ptr @strpbrk(ptr, ptr nocapture) [[ARGMEMONLY_NOFREE_NOUNWIND_READONLY_WILLRETURN]]
@@ -1088,6 +1097,8 @@ declare i32 @vsscanf(ptr, ptr, ptr)
; CHECK: declare noundef i64 @write(i32 noundef, ptr nocapture noundef readonly, i64 noundef) [[NOFREE]]
declare i64 @write(i32, ptr, i64)
+; CHECK: declare void @abort() [[NOFREE_COLD:#[0-9]+]]
+declare void @abort()
; memset_pattern{4,8,16} aren't available everywhere.
; CHECK-DARWIN: declare void @memset_pattern4(ptr nocapture writeonly, ptr nocapture readonly, i64) [[ARGMEMONLY_NOFREE_NOUNWIND_WILLRETURN]]
@@ -1114,6 +1125,7 @@ declare void @memset_pattern16(ptr, ptr, i64)
; CHECK-DAG: attributes [[ARGMEMONLY_NOFREE_NOUNWIND]] = { nofree nounwind memory(argmem: readwrite) }
; CHECK-DAG: attributes [[INACCESSIBLEMEMORARGMEMONLY_NOUNWIND_WILLRETURN_ALLOCKIND_REALLOC_ALLOCSIZE1_FAMILY_MALLOC]] = { mustprogress nounwind willreturn allockind("realloc") allocsize(1) memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" }
; CHECK-DAG: attributes [[INACCESSIBLEMEMORARGONLY_NOFREE_NOUNWIND_WILLRETURN_FAMILY_MALLOC]] = { mustprogress nofree nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" }
+; CHECK-DAG: attributes [[NOFREE_COLD]] = { cold nofree }
; CHECK-NVPTX-DAG: attributes [[NOFREE_NOUNWIND_READNONE]] = { nofree nosync nounwind memory(none) }
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 9fbc392..04bba79 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -7,12 +7,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; DemandedBits - MOVMSK zeros the upper bits of the result.
;
-define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_upper_x86_mmx_pmovmskb(<1 x i64> %a0) {
; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[A0:%.*]])
; CHECK-NEXT: ret i32 [[TMP1]]
;
- %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+ %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
%2 = and i32 %1, 255
ret i32 %2
}
@@ -87,11 +87,11 @@ define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
; DemandedBits - If we don't use the lower bits then we just return zero.
;
-define i32 @test_lower_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_lower_x86_mmx_pmovmskb(<1 x i64> %a0) {
; CHECK-LABEL: @test_lower_x86_mmx_pmovmskb(
; CHECK-NEXT: ret i32 0
;
- %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+ %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
%2 = and i32 %1, -256
ret i32 %2
}
@@ -151,7 +151,7 @@ define i32 @undef_x86_mmx_pmovmskb() {
; CHECK-LABEL: @undef_x86_mmx_pmovmskb(
; CHECK-NEXT: ret i32 0
;
- %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx undef)
+ %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> undef)
ret i32 %1
}
@@ -264,8 +264,8 @@ define i32 @fold_x86_mmx_pmovmskb() {
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> <i64 18084223940296448>)
; CHECK-NEXT: ret i32 [[TMP1]]
;
- %1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to x86_mmx
- %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
+ %1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to <1 x i64>
+ %2 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %1)
ret i32 %2
}
@@ -437,7 +437,7 @@ define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
ret i32 %r
}
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>)
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll
index 5599604..26047f2 100644
--- a/llvm/test/Transforms/InstCombine/bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast.ll
@@ -711,3 +711,171 @@ define ptr @select_bitcast_unsized_pointer(i1 %c) {
%s = select i1 %c, ptr @f1, ptr @f2
ret ptr %s
}
+
+define float @copysign_idiom_constant(float %x) {
+; CHECK-LABEL: @copysign_idiom_constant(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, 1065353216
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %cond = icmp sgt i32 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, %mag
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_commuted(float %x, i32 %magx) {
+; CHECK-LABEL: @copysign_idiom_commuted(
+; CHECK-NEXT: [[MAG:%.*]] = add i32 [[MAGX:%.*]], -1
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[MAG]] to float
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[TMP1]], float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %mag = add i32 %magx, -1 ; thwart complexity-based canonicalization
+ %cond = icmp sgt i32 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %mag, %sign
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_abs(float %x, float %mag) {
+; CHECK-LABEL: @copysign_idiom_abs(
+; CHECK-NEXT: [[Y:%.*]] = call float @llvm.copysign.f32(float [[MAG:%.*]], float [[X:%.*]])
+; CHECK-NEXT: ret float [[Y]]
+;
+ %abs = call float @llvm.fabs.f32(float %mag)
+ %absbits = bitcast float %abs to i32
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, %absbits
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define double @copysign_idiom_f64(double %x, i64 %mag) {
+; CHECK-LABEL: @copysign_idiom_f64(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[MAG]] to double
+; CHECK-NEXT: [[Y:%.*]] = call double @llvm.copysign.f64(double [[TMP1]], double [[X:%.*]])
+; CHECK-NEXT: ret double [[Y]]
+;
+ %cond = icmp sgt i64 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast double %x to i64
+ %sign = and i64 %bits, -9223372036854775808
+ %res = or i64 %sign, %mag
+ %y = bitcast i64 %res to double
+ ret double %y
+}
+
+define <2 x float> @copysign_idiom_vec(<2 x float> %x) {
+; CHECK-LABEL: @copysign_idiom_vec(
+; CHECK-NEXT: [[Y:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> [[X:%.*]])
+; CHECK-NEXT: ret <2 x float> [[Y]]
+;
+ %bits = bitcast <2 x float> %x to <2 x i32>
+ %sign = and <2 x i32> %bits, splat(i32 -2147483648)
+ %res = or <2 x i32> %sign, splat(i32 1065353216)
+ %y = bitcast <2 x i32> %res to <2 x float>
+ ret <2 x float> %y
+}
+
+; negative tests
+
+define float @copysign_idiom_without_nneg(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom_without_nneg(
+; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483648
+; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG:%.*]]
+; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float
+; CHECK-NEXT: ret float [[Y]]
+;
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, %mag
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_not_signmask(float %x, i32 %mag) {
+; CHECK-LABEL: @copysign_idiom_not_signmask(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[BITS:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT: [[SIGN:%.*]] = and i32 [[BITS]], -2147483647
+; CHECK-NEXT: [[RES:%.*]] = or i32 [[SIGN]], [[MAG]]
+; CHECK-NEXT: [[Y:%.*]] = bitcast i32 [[RES]] to float
+; CHECK-NEXT: ret float [[Y]]
+;
+ %cond = icmp sgt i32 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast float %x to i32
+ %sign = and i32 %bits, -2147483647
+ %res = or i32 %sign, %mag
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define float @copysign_idiom_constant_wrong_type1(<1 x i32> %x) {
+; CHECK-LABEL: @copysign_idiom_constant_wrong_type1(
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[X:%.*]], i64 0
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: ret float 1.000000e+00
+;
+ %bits = bitcast <1 x i32> %x to i32
+ %cond = icmp sgt i32 %bits, -1
+ call void @llvm.assume(i1 %cond)
+
+ %sign = and i32 %bits, -2147483648
+ %res = or i32 %sign, 1065353216
+ %y = bitcast i32 %res to float
+ ret float %y
+}
+
+define half @copysign_idiom_constant_wrong_type2(bfloat %x, i16 %mag) {
+; CHECK-LABEL: @copysign_idiom_constant_wrong_type2(
+; CHECK-NEXT: [[COND:%.*]] = icmp sgt i16 [[MAG:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[BITS:%.*]] = bitcast bfloat [[X:%.*]] to i16
+; CHECK-NEXT: [[SIGN:%.*]] = and i16 [[BITS]], -32768
+; CHECK-NEXT: [[RES:%.*]] = or disjoint i16 [[SIGN]], [[MAG]]
+; CHECK-NEXT: [[Y:%.*]] = bitcast i16 [[RES]] to half
+; CHECK-NEXT: ret half [[Y]]
+;
+ %cond = icmp sgt i16 %mag, -1
+ call void @llvm.assume(i1 %cond)
+
+ %bits = bitcast bfloat %x to i16
+ %sign = and i16 %bits, -32768
+ %res = or i16 %sign, %mag
+ %y = bitcast i16 %res to half
+ ret half %y
+}
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
index 6cee7bb..564e829 100644
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -937,27 +937,6 @@ define float @test2c() {
ret float extractelement (<2 x float> bitcast (double bitcast (<2 x float> <float -1.000000e+00, float -1.000000e+00> to double) to <2 x float>), i32 0)
}
-define i64 @test_mmx(<2 x i32> %x) {
-; ALL-LABEL: @test_mmx(
-; ALL-NEXT: [[C:%.*]] = bitcast <2 x i32> [[X:%.*]] to i64
-; ALL-NEXT: ret i64 [[C]]
-;
- %A = bitcast <2 x i32> %x to x86_mmx
- %B = bitcast x86_mmx %A to <2 x i32>
- %C = bitcast <2 x i32> %B to i64
- ret i64 %C
-}
-
-define i64 @test_mmx_const(<2 x i32> %c) {
-; ALL-LABEL: @test_mmx_const(
-; ALL-NEXT: ret i64 0
-;
- %A = bitcast <2 x i32> zeroinitializer to x86_mmx
- %B = bitcast x86_mmx %A to <2 x i32>
- %C = bitcast <2 x i32> %B to i64
- ret i64 %C
-}
-
; PR12514
define i1 @test67(i1 %a, i32 %b) {
; ALL-LABEL: @test67(
diff --git a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
index 7facdaf..4ef1ed0 100644
--- a/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
+++ b/llvm/test/Transforms/InstCombine/ctpop-pow2.ll
@@ -60,7 +60,7 @@ define i8 @ctpop_imin_plus1_lshr_nz(i8 %x) {
; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[X:%.*]], 0
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
; CHECK-NEXT: [[V:%.*]] = lshr i8 -127, [[X]]
-; CHECK-NEXT: [[CNT:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[V]])
+; CHECK-NEXT: [[CNT:%.*]] = call range(i8 1, 9) i8 @llvm.ctpop.i8(i8 [[V]])
; CHECK-NEXT: ret i8 [[CNT]]
;
%cmp = icmp ne i8 %x, 0
@@ -104,7 +104,7 @@ define <2 x i32> @ctpop_lshr_intmin_intmin_plus1_vec_nz(<2 x i32> %x) {
; CHECK-LABEL: @ctpop_lshr_intmin_intmin_plus1_vec_nz(
; CHECK-NEXT: [[X1:%.*]] = or <2 x i32> [[X:%.*]], <i32 1, i32 1>
; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> <i32 -2147483648, i32 -2147483647>, [[X1]]
-; CHECK-NEXT: [[CNT:%.*]] = call range(i32 0, 17) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[SHR]])
+; CHECK-NEXT: [[CNT:%.*]] = call range(i32 1, 17) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[SHR]])
; CHECK-NEXT: ret <2 x i32> [[CNT]]
;
%x1 = or <2 x i32> %x, <i32 1 ,i32 1>
diff --git a/llvm/test/Transforms/InstCombine/ctpop.ll b/llvm/test/Transforms/InstCombine/ctpop.ll
index 83700e7..940bb86 100644
--- a/llvm/test/Transforms/InstCombine/ctpop.ll
+++ b/llvm/test/Transforms/InstCombine/ctpop.ll
@@ -169,8 +169,8 @@ define <2 x i32> @_parity_of_not_poison(<2 x i32> %x) {
define <2 x i32> @_parity_of_not_poison2(<2 x i32> %x) {
; CHECK-LABEL: @_parity_of_not_poison2(
-; CHECK-NEXT: [[CNT:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[CNT]], <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
+; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[TMP1]], <i32 1, i32 poison>
; CHECK-NEXT: ret <2 x i32> [[R]]
;
%neg = xor <2 x i32> %x, <i32 -1 ,i32 -1>
@@ -485,3 +485,21 @@ define i32 @select_ctpop_zero(i32 %x) {
%res = select i1 %cmp, i32 0, i32 %ctpop
ret i32 %res
}
+
+define i32 @ctpop_non_zero(i32 range(i32 1, 255) %x) {
+; CHECK-LABEL: @ctpop_non_zero(
+; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 1, 9) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
+; CHECK-NEXT: ret i32 [[CTPOP]]
+;
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %ctpop
+}
+
+define i32 @ctpop_non_zero_with_existing_range_attr(i32 range(i32 1, 255) %x) {
+; CHECK-LABEL: @ctpop_non_zero_with_existing_range_attr(
+; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 1, 9) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
+; CHECK-NEXT: ret i32 [[CTPOP]]
+;
+ %ctpop = call range(i32 0, 9) i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %ctpop
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
index e9ec6b4..618f5d6 100644
--- a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll
@@ -306,7 +306,7 @@ define i32 @pow2_32_nonconst_assume(i32 %x, i32 %y) {
define i32 @pow2_32_gtnonconst_assume(i32 %x, i32 %y) {
; CHECK-LABEL: @pow2_32_gtnonconst_assume(
-; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
+; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]])
; CHECK-NEXT: [[YP2:%.*]] = icmp eq i32 [[CTPOP]], 1
; CHECK-NEXT: call void @llvm.assume(i1 [[YP2]])
; CHECK-NEXT: [[YGT:%.*]] = icmp ugt i32 [[Y]], [[X:%.*]]
@@ -513,7 +513,7 @@ define i32 @maybe_pow2_32_noncont(i32 %x, i32 %y) {
; CHECK-NEXT: [[YGT8:%.*]] = icmp ugt i32 [[Y:%.*]], 8
; CHECK-NEXT: br i1 [[YGT8]], label [[CONT1:%.*]], label [[CONT2:%.*]]
; CHECK: Cont1:
-; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y]])
+; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[Y]])
; CHECK-NEXT: [[YP2:%.*]] = icmp eq i32 [[CTPOP]], 1
; CHECK-NEXT: call void @llvm.assume(i1 [[YP2]])
; CHECK-NEXT: br i1 true, label [[CONT2]], label [[FALSE:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll
index a143b13..3f2c31d 100644
--- a/llvm/test/Transforms/InstCombine/ispow2.ll
+++ b/llvm/test/Transforms/InstCombine/ispow2.ll
@@ -197,7 +197,7 @@ define i1 @is_pow2_non_zero_ult_2(i32 %x) {
; CHECK-LABEL: @is_pow2_non_zero_ult_2(
; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
; CHECK-NEXT: call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2
; CHECK-NEXT: ret i1 [[CMP]]
;
@@ -212,7 +212,7 @@ define i1 @is_pow2_non_zero_eq_1(i32 %x) {
; CHECK-LABEL: @is_pow2_non_zero_eq_1(
; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
; CHECK-NEXT: call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[T0]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
@@ -227,7 +227,7 @@ define i1 @is_pow2_non_zero_ugt_1(i32 %x) {
; CHECK-LABEL: @is_pow2_non_zero_ugt_1(
; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
; CHECK-NEXT: call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
@@ -242,7 +242,7 @@ define i1 @is_pow2_non_zero_ne_1(i32 %x) {
; CHECK-LABEL: @is_pow2_non_zero_ne_1(
; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
; CHECK-NEXT: call void @llvm.assume(i1 [[NOTZERO]])
-; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
+; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[T0]], 1
; CHECK-NEXT: ret i1 [[CMP]]
;
diff --git a/llvm/test/Transforms/InstCombine/lib-call-exit.ll b/llvm/test/Transforms/InstCombine/lib-call-exit.ll
new file mode 100644
index 0000000..ad133af
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/lib-call-exit.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+declare void @exit(i32)
+declare void @_Exit(i32)
+
+define void @call_exit_0() {
+; CHECK-LABEL: define void @call_exit_0() {
+; CHECK-NEXT: call void @exit(i32 0)
+; CHECK-NEXT: ret void
+;
+ call void @exit(i32 0)
+ ret void
+}
+
+define void @call_exit_1() {
+; CHECK-LABEL: define void @call_exit_1() {
+; CHECK-NEXT: call void @exit(i32 1) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @exit(i32 1)
+ ret void
+}
+
+define void @call__Exit_m1() {
+; CHECK-LABEL: define void @call__Exit_m1() {
+; CHECK-NEXT: call void @_Exit(i32 -1) #[[ATTR0]]
+; CHECK-NEXT: ret void
+;
+ call void @_Exit(i32 -1)
+ ret void
+}
+
+define void @call__Exit_N(i32 %N) {
+; CHECK-LABEL: define void @call__Exit_N(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT: call void @_Exit(i32 [[N]])
+; CHECK-NEXT: ret void
+;
+ call void @_Exit(i32 %N)
+ ret void
+}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { cold }
+;.
diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll
index 7d53c8e..6c087aa 100644
--- a/llvm/test/Transforms/InstCombine/load.ll
+++ b/llvm/test/Transforms/InstCombine/load.ll
@@ -56,6 +56,18 @@ define i32 @test5(i1 %C) {
ret i32 %Z
}
+; FIXME: Constants should be allowed for this optimization.
+define i32 @test5_asan(i1 %C) sanitize_address {
+; CHECK-LABEL: @test5_asan(
+; CHECK-NEXT: [[Y:%.*]] = select i1 [[C:%.*]], ptr @X, ptr @X2
+; CHECK-NEXT: [[Z:%.*]] = load i32, ptr [[Y]], align 4
+; CHECK-NEXT: ret i32 [[Z]]
+;
+ %Y = select i1 %C, ptr @X, ptr @X2 ; <ptr> [#uses=1]
+ %Z = load i32, ptr %Y ; <i32> [#uses=1]
+ ret i32 %Z
+}
+
define i32 @load_gep_null_inbounds(i64 %X) {
; CHECK-LABEL: @load_gep_null_inbounds(
; CHECK-NEXT: store i1 true, ptr poison, align 1
diff --git a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
index a75a649..38fca03 100644
--- a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
+++ b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll
@@ -7,8 +7,8 @@ define void @PR35618(ptr %st1, ptr %st2) {
; CHECK-NEXT: [[Z1:%.*]] = alloca double, align 8
; CHECK-NEXT: [[LD1:%.*]] = load double, ptr [[Y1]], align 8
; CHECK-NEXT: [[LD2:%.*]] = load double, ptr [[Z1]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = fcmp olt double [[LD1]], [[LD2]]
-; CHECK-NEXT: [[TMP12_V:%.*]] = select i1 [[TMP10]], double [[LD1]], double [[LD2]]
+; CHECK-NEXT: [[TMP:%.*]] = fcmp olt double [[LD1]], [[LD2]]
+; CHECK-NEXT: [[TMP12_V:%.*]] = select i1 [[TMP]], double [[LD1]], double [[LD2]]
; CHECK-NEXT: store double [[TMP12_V]], ptr [[ST1:%.*]], align 8
; CHECK-NEXT: store double [[TMP12_V]], ptr [[ST2:%.*]], align 8
; CHECK-NEXT: ret void
@@ -17,8 +17,32 @@ define void @PR35618(ptr %st1, ptr %st2) {
%z1 = alloca double
%ld1 = load double, ptr %y1
%ld2 = load double, ptr %z1
- %tmp10 = fcmp olt double %ld1, %ld2
- %sel = select i1 %tmp10, ptr %y1, ptr %z1
+ %tmp = fcmp olt double %ld1, %ld2
+ %sel = select i1 %tmp, ptr %y1, ptr %z1
+ %tmp12 = load i64, ptr %sel
+ store i64 %tmp12, ptr %st1
+ store i64 %tmp12, ptr %st2
+ ret void
+}
+
+define void @PR35618_asan(ptr %st1, ptr %st2) sanitize_address {
+; CHECK-LABEL: @PR35618_asan(
+; CHECK-NEXT: [[Y1:%.*]] = alloca double, align 8
+; CHECK-NEXT: [[Z1:%.*]] = alloca double, align 8
+; CHECK-NEXT: [[LD1:%.*]] = load double, ptr [[Y1]], align 8
+; CHECK-NEXT: [[LD2:%.*]] = load double, ptr [[Z1]], align 8
+; CHECK-NEXT: [[TMP:%.*]] = fcmp olt double [[LD1]], [[LD2]]
+; CHECK-NEXT: [[TMP12_V:%.*]] = select i1 [[TMP]], double [[LD1]], double [[LD2]]
+; CHECK-NEXT: store double [[TMP12_V]], ptr [[ST1:%.*]], align 8
+; CHECK-NEXT: store double [[TMP12_V]], ptr [[ST2:%.*]], align 8
+; CHECK-NEXT: ret void
+;
+ %y1 = alloca double
+ %z1 = alloca double
+ %ld1 = load double, ptr %y1
+ %ld2 = load double, ptr %z1
+ %tmp = fcmp olt double %ld1, %ld2
+ %sel = select i1 %tmp, ptr %y1, ptr %z1
%tmp12 = load i64, ptr %sel
store i64 %tmp12, ptr %st1
store i64 %tmp12, ptr %st2
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index 7c65a93..a1d10c2 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -427,6 +427,23 @@ entry:
ret i8 %load
}
+define i8 @select_diff_addrspace_remove_alloca_asan(i1 %cond, ptr %p) sanitize_address {
+; CHECK-LABEL: @select_diff_addrspace_remove_alloca_asan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP2:%.*]] = select i1 [[COND:%.*]], ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g2, i64 4), ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @g2, i64 6)
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(1) [[GEP2]], align 1
+; CHECK-NEXT: ret i8 [[LOAD]]
+;
+entry:
+ %alloca = alloca [32 x i8]
+ call void @llvm.memcpy.p0.p1.i64(ptr %alloca, ptr addrspace(1) @g2, i64 32, i1 false)
+ %gep = getelementptr inbounds [32 x i8], ptr %alloca, i32 0, i32 2
+ %sel = select i1 %cond, ptr %alloca, ptr %gep
+ %gep2 = getelementptr inbounds i8, ptr %sel, i64 4
+ %load = load i8, ptr %gep2
+ ret i8 %load
+}
+
declare i8 @readonly_callee(ptr readonly nocapture)
; FIXME: This should be able to fold to call i8 @readonly_callee(ptr nonnull @g1)
diff --git a/llvm/test/Transforms/InstCombine/select-load.ll b/llvm/test/Transforms/InstCombine/select-load.ll
new file mode 100644
index 0000000..3688342
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-load.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-grtev4-linux-gnu"
+
+define i32 @test_plain(i1 %f) {
+; CHECK-LABEL: @test_plain(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]], align 8
+; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B]], align 8
+; CHECK-NEXT: [[L:%.*]] = select i1 [[F:%.*]], i32 [[A_VAL]], i32 [[B_VAL]]
+; CHECK-NEXT: ret i32 [[L]]
+;
+entry:
+ %a = alloca i32, align 8
+ %b = alloca i32, align 8
+ %sel = select i1 %f, ptr %a, ptr %b
+ %l = load i32, ptr %sel, align 8
+ ret i32 %l
+}
+
+; Don't speculate, as the condition may control which memory is valid from the
+; sanitizer's perspective.
+define i32 @test_asan(i1 %f) sanitize_address {
+; CHECK-LABEL: @test_asan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[F:%.*]], ptr [[A]], ptr [[B]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[SEL]], align 8
+; CHECK-NEXT: ret i32 [[L]]
+;
+entry:
+ %a = alloca i32, align 8
+ %b = alloca i32, align 8
+ %sel = select i1 %f, ptr %a, ptr %b
+ %l = load i32, ptr %sel, align 8
+ ret i32 %l
+}
+
+
+; Don't speculate, as the condition may control which memory is valid from the
+; sanitizer's perspective.
+define i32 @test_hwasan(i1 %f) sanitize_hwaddress {
+; CHECK-LABEL: @test_hwasan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[F:%.*]], ptr [[A]], ptr [[B]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[SEL]], align 8
+; CHECK-NEXT: ret i32 [[L]]
+;
+entry:
+ %a = alloca i32, align 8
+ %b = alloca i32, align 8
+ %sel = select i1 %f, ptr %a, ptr %b
+ %l = load i32, ptr %sel, align 8
+ ret i32 %l
+}
+
+; Don't speculate, as the condition may control which memory is valid from the
+; sanitizer's perspective.
+define i32 @test_tsan(i1 %f) sanitize_thread {
+; CHECK-LABEL: @test_tsan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[F:%.*]], ptr [[A]], ptr [[B]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[SEL]], align 8
+; CHECK-NEXT: ret i32 [[L]]
+;
+entry:
+ %a = alloca i32, align 8
+ %b = alloca i32, align 8
+ %sel = select i1 %f, ptr %a, ptr %b
+ %l = load i32, ptr %sel, align 8
+ ret i32 %l
+}
+
+; MSan just propagates shadow: even if the speculated load accesses an uninitialized
+; value, the instrumentation will select the shadow of the desired value anyway.
+define i32 @test_msan(i1 %f) sanitize_memory {
+; CHECK-LABEL: @test_msan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[B:%.*]] = alloca i32, align 8
+; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A]], align 8
+; CHECK-NEXT: [[B_VAL:%.*]] = load i32, ptr [[B]], align 8
+; CHECK-NEXT: [[L:%.*]] = select i1 [[F:%.*]], i32 [[A_VAL]], i32 [[B_VAL]]
+; CHECK-NEXT: ret i32 [[L]]
+;
+entry:
+ %a = alloca i32, align 8
+ %b = alloca i32, align 8
+ %sel = select i1 %f, ptr %a, ptr %b
+ %l = load i32, ptr %sel, align 8
+ ret i32 %l
+}
diff --git a/llvm/test/Transforms/InstCombine/strnlen-2.ll b/llvm/test/Transforms/InstCombine/strnlen-2.ll
index 5e95aaf..9c04240 100644
--- a/llvm/test/Transforms/InstCombine/strnlen-2.ll
+++ b/llvm/test/Transforms/InstCombine/strnlen-2.ll
@@ -38,6 +38,21 @@ define i64 @fold_strnlen_s3_s5_1(i1 %C) {
ret i64 %len
}
+; FIXME: Constants should be allowed for this optimization.
+define i64 @fold_strnlen_s3_s5_1_asan(i1 %C) sanitize_address {
+; CHECK-LABEL: @fold_strnlen_s3_s5_1_asan(
+; CHECK-NEXT: [[PTR:%.*]] = select i1 [[C:%.*]], ptr @s3, ptr @s6
+; CHECK-NEXT: [[STRNLEN_CHAR0:%.*]] = load i8, ptr [[PTR]], align 1
+; CHECK-NEXT: [[STRNLEN_CHAR0CMP:%.*]] = icmp ne i8 [[STRNLEN_CHAR0]], 0
+; CHECK-NEXT: [[LEN:%.*]] = zext i1 [[STRNLEN_CHAR0CMP]] to i64
+; CHECK-NEXT: ret i64 [[LEN]]
+;
+ %ptr = select i1 %C, ptr @s3, ptr @s6
+
+ %len = call i64 @strnlen(ptr %ptr, i64 1)
+ ret i64 %len
+}
+
; Fold strnlen (C ? s3 : s5, 3) to 3.
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
index bce07b0..c383ff7 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
@@ -12,18 +12,5 @@ define <2 x ptr> @test_gep() {
; CHECK-NEXT: ret <2 x ptr> <ptr @a, ptr @a>
;
%A = getelementptr [1 x %rec8], ptr @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer
- %B = bitcast <2 x ptr> %A to <2 x ptr>
- ret <2 x ptr> %B
-}
-
-; Testcase that verify the cast-of-cast when the outer/second cast is to a
-; vector type.
-
-define <4 x i16> @test_mmx_const() {
-; CHECK-LABEL: @test_mmx_const(
-; CHECK-NEXT: ret <4 x i16> zeroinitializer
-;
- %A = bitcast <2 x i32> zeroinitializer to x86_mmx
- %B = bitcast x86_mmx %A to <4 x i16>
- ret <4 x i16> %B
+ ret <2 x ptr> %A
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
index b66bb94..0ef03c5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-alloca.ll
@@ -2,8 +2,8 @@
; RUN: FileCheck %s --check-prefix=CHECK-REMARKS < %t
; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
-; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): alloca
+; CHECK-REMARKS: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @alloca(ptr %vla, i64 %N) {
; CHECK-LABEL: @alloca(
; CHECK-NOT: <vscale x
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index 333bb20..bc6eeb4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -101,9 +101,9 @@ for.end:
}
; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
-; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
@@ -127,10 +127,10 @@ for.cond.cleanup: ; preds = %for.body
}
; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
-; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:40: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping_ite(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_no_mapping_ite
; CHECK-NOT: <vscale x
@@ -163,9 +163,9 @@ for.cond.cleanup: ; preds = %for.body
}
; CHECK-REMARKS: UserVF ignored because of invalid costs.
-; CHECK-REMARKS-NEXT: t.c:3:10: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): load
-; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
-; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
+; CHECK-REMARKS-NEXT: t.c:3:10: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): load
+; CHECK-REMARKS-NEXT: t.c:3:20: Recipe with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
+; CHECK-REMARKS-NEXT: t.c:3:30: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_fixed_mapping(ptr noalias nocapture %dst, ptr noalias nocapture readonly %src, i64 %n) {
; CHECK: @vec_sin_fixed_mapping
; CHECK: call fast <2 x float> @llvm.sin.v2f32
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index b699b24..bb716d7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -7,24 +7,22 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP2]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], 1
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP1]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP0]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP5]], i32 4)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i32> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 2 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 [[TMP3]], i32 4)
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32.p0(ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i32> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <vscale x 1 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 1 x i32> insertelement (<vscale x 1 x i32> poison, i32 1, i64 0), <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.store.nxv1i32.p0(<vscale x 1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP2]]
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
index 1627292..3803921 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -1,4 +1,5 @@
; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
+; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s --check-prefix=CHECK-AVX-VF2
; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF8
; RUN: opt -vector-library=AMDLIBM -passes=inject-tli-mappings,loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -mattr=+avx512f -S < %s | FileCheck %s --check-prefix=CHECK-AVX512-VF16
@@ -20,6 +21,27 @@ declare float @tanf(float) #0
declare double @llvm.tan.f64(double) #0
declare float @llvm.tan.f32(float) #0
+declare float @acosf(float) #0
+declare float @llvm.acos.f32(float) #0
+
+declare double @asin(double) #0
+declare float @asinf(float) #0
+declare double @llvm.asin.f64(double) #0
+declare float @llvm.asin.f32(float) #0
+
+declare double @atan(double) #0
+declare float @atanf(float) #0
+declare double @llvm.atan.f64(double) #0
+declare float @llvm.atan.f32(float) #0
+
+declare double @cosh(double) #0
+declare float @coshf(float) #0
+declare double @llvm.cosh.f64(double) #0
+declare float @llvm.cosh.f32(float) #0
+
+declare float @tanhf(float) #0
+declare float @llvm.tanh.f32(float) #0
+
declare double @pow(double, double) #0
declare float @powf(float, float) #0
declare double @llvm.pow.f64(double, double) #0
@@ -274,6 +296,10 @@ define void @tan_f64(ptr nocapture %varray) {
; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
+; CHECK-AVX-VF2-LABEL: @tan_f64(
+; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2: ret void
+;
; CHECK-AVX512-VF8-LABEL: @tan_f64(
; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]])
; CHECK-AVX512-VF8: ret void
@@ -328,6 +354,10 @@ define void @tan_f64_intrinsic(ptr nocapture %varray) {
; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]])
; CHECK: ret void
;
+; CHECK-AVX-VF2-LABEL: @tan_f64_intrinsic(
+; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2: ret void
+;
; CHECK-AVX512-VF8-LABEL: @tan_f64_intrinsic(
; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]])
; CHECK-AVX512-VF8: ret void
@@ -377,6 +407,360 @@ for.end:
ret void
}
+define void @acos_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @acos_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @acosf(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @acos_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @acos_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_acosf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.acos.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @asin_f64(ptr nocapture %varray) {
+; CHECK-AVX512-VF8-LABEL: @asin_f64(
+; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @asin(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @asin_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @asin_f32(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @asinf(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @asin_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-AVX512-VF8-LABEL: @asin_f64_intrinsic(
+; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_asin(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.asin.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @asin_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @asin_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_asinf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @asin_f32_intrinsic(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_asinf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.asin.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @atan_f64(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f64(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX-VF2-LABEL: @atan_f64(
+; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2: ret void
+;
+; CHECK-AVX512-VF8-LABEL: @atan_f64(
+; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @atan(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @atan_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @atan_f32(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @atanf(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @atan_f64_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f64_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX-VF2-LABEL: @atan_f64_intrinsic(
+; CHECK-AVX-VF2: [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2: ret void
+;
+; CHECK-AVX512-VF8-LABEL: @atan_f64_intrinsic(
+; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]])
+; CHECK-AVX512-VF8: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.atan.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv
+ store double %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @atan_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @atan_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_atanf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+; CHECK-AVX512-VF16-LABEL: @atan_f32_intrinsic(
+; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_atanf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.atan.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cosh_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @cosh_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @coshf(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cosh_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @cosh_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_coshf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.cosh.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @tanh_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @tanh_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @tanhf(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @tanh_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @tanh_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.tanh.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+ store float %call, ptr %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64(
; CHECK: [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
index 27038f3..b3a5836 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
@@ -456,6 +456,31 @@ for.end: ; preds = %for.body, %entry
ret void
}
+;CHECK-LABEL: @asin_f32_intrinsic(
+;CHECK: vasinf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.asin.f32(float) nounwind readnone
+define void @asin_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.asin.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+ store float %call, ptr %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
;CHECK-LABEL: @acos_f32(
;CHECK: vacosf{{.*}}<4 x float>
;CHECK: ret void
@@ -481,6 +506,31 @@ for.end: ; preds = %for.body, %entry
ret void
}
+;CHECK-LABEL: @acos_f32_intrinsic(
+;CHECK: vacosf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.acos.f32(float) nounwind readnone
+define void @acos_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.acos.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+ store float %call, ptr %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
;CHECK-LABEL: @atan_f32(
;CHECK: vatanf{{.*}}<4 x float>
;CHECK: ret void
@@ -506,6 +556,31 @@ for.end: ; preds = %for.body, %entry
ret void
}
+;CHECK-LABEL: @atan_f32_intrinsic(
+;CHECK: vatanf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.atan.f32(float) nounwind readnone
+define void @atan_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.atan.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+ store float %call, ptr %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
;CHECK-LABEL: @sinh_f32(
;CHECK: vsinhf{{.*}}<4 x float>
;CHECK: ret void
@@ -531,6 +606,31 @@ for.end: ; preds = %for.body, %entry
ret void
}
+;CHECK-LABEL: @sinh_f32_intrinsic(
+;CHECK: vsinhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.sinh.f32(float) nounwind readnone
+define void @sinh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.sinh.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+ store float %call, ptr %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
;CHECK-LABEL: @cosh_f32(
;CHECK: vcoshf{{.*}}<4 x float>
;CHECK: ret void
@@ -556,6 +656,31 @@ for.end: ; preds = %for.body, %entry
ret void
}
+;CHECK-LABEL: @cosh_f32_intrinsic(
+;CHECK: vcoshf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.cosh.f32(float) nounwind readnone
+define void @cosh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.cosh.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+ store float %call, ptr %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
;CHECK-LABEL: @tanh_f32(
;CHECK: vtanhf{{.*}}<4 x float>
;CHECK: ret void
@@ -581,6 +706,31 @@ for.end: ; preds = %for.body, %entry
ret void
}
+;CHECK-LABEL: @tanh_f32_intrinsic(
+;CHECK: vtanhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.tanh.f32(float) nounwind readnone
+define void @tanh_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.tanh.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+ store float %call, ptr %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
;CHECK-LABEL: @asinh_f32(
;CHECK: vasinhf{{.*}}<4 x float>
;CHECK: ret void
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll
index 9a771e0..abba3dd 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall.ll
@@ -16,6 +16,12 @@ source_filename = "memprof-tailcall.cc"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+@a = dso_local global [2 x ptr] [ptr @_Z2a1v, ptr @_Z2a2v], align 16
+
+declare void @_Z2a1v() #0
+
+declare void @_Z2a2v() #0
+
; Function Attrs: noinline
; IR-LABEL: @_Z3barv()
define ptr @_Z3barv() local_unnamed_addr #0 {
@@ -58,6 +64,8 @@ define i32 @main() #0 {
;; cloned functions.
; IR: call ptr @_Z3foov.memprof.1()
%call1 = tail call ptr @_Z3foov(), !callsite !7
+ %2 = load ptr, ptr @a, align 16
+ call void %2(), !callsite !10
ret i32 0
}
@@ -79,7 +87,7 @@ attributes #0 = { noinline }
attributes #1 = { nobuiltin allocsize(0) }
attributes #2 = { builtin allocsize(0) }
-!0 = !{!1, !3}
+!0 = !{!1, !3, !8}
!1 = !{!2, !"notcold"}
!2 = !{i64 3186456655321080972, i64 8632435727821051414}
!3 = !{!4, !"cold"}
@@ -87,3 +95,6 @@ attributes #2 = { builtin allocsize(0) }
!5 = !{i64 3186456655321080972}
!6 = !{i64 8632435727821051414}
!7 = !{i64 -3421689549917153178}
+!8 = !{!9, !"notcold"}
+!9 = !{i64 3186456655321080972, i64 1}
+!10 = !{i64 1}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
index 0e14481..22511c0 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -2,7 +2,9 @@
; RUN: opt -S -O3 < %s | FileCheck %s
; Check unrolling / SLP vectorization where the order of lanes is important for
-; producing efficient shuffles.
+; producing efficient shuffles. The shuffles should be regular and cheap for
+; AArch64. [0 2 4 6] and [1 3 5 7] will produce uzp1/uzp2 instructions. The
+; v16i32 shuffles will be legalized to individual v4i32 shuffles.
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"
@@ -44,29 +46,29 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]]
; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]]
; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
-; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]]
@@ -84,19 +86,19 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
; CHECK-NEXT: [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
-; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 2, i32 5, i32 6, i32 17, i32 18, i32 21, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 3, i32 4, i32 7, i32 16, i32 19, i32 20, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 3, i32 4, i32 7, i32 16, i32 19, i32 20, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 2, i32 5, i32 6, i32 17, i32 18, i32 21, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
-; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
-; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 20, i32 16, i32 21, i32 17, i32 22, i32 18, i32 23, i32 19>
+; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT: [[TMP70:%.*]] = and <16 x i32> [[TMP69]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT: [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
diff --git a/llvm/test/Transforms/SCCP/crash.ll b/llvm/test/Transforms/SCCP/crash.ll
index 8f8ad44d..47d9329 100644
--- a/llvm/test/Transforms/SCCP/crash.ll
+++ b/llvm/test/Transforms/SCCP/crash.ll
@@ -28,7 +28,7 @@ define i32 @test2([4 x i32] %A) {
ret i32 %B
}
-define x86_mmx @test3() {
- %load = load x86_mmx, ptr null
- ret x86_mmx %load
+define <1 x i64> @test3() {
+ %load = load <1 x i64>, ptr null
+ ret <1 x i64> %load
}
diff --git a/llvm/test/Transforms/SCCP/float-denormal-simplification.ll b/llvm/test/Transforms/SCCP/float-denormal-simplification.ll
new file mode 100644
index 0000000..fec9883
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/float-denormal-simplification.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=sccp -S %s | FileCheck %s
+
+define float @test_ieee() #0 {
+; CHECK-LABEL: @test_ieee(
+; CHECK-NEXT: ret float 0x36F4000000000000
+;
+ %1 = fmul float 2.802596928649634e-44, 2.000000e+00
+ ret float %1
+}
+
+define float @test_preserve_sign() #1 {
+; CHECK-LABEL: @test_preserve_sign(
+; CHECK-NEXT: ret float 0.000000e+00
+;
+ %1 = fmul float 2.802596928649634e-44, 2.000000e+00
+ ret float %1
+}
+
+attributes #0 = {"denormal-fp-math"="ieee,ieee"}
+attributes #1 = {"denormal-fp-math"="preserve-sign,preserve-sign"}
diff --git a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
index d605611..ff57569 100644
--- a/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
+++ b/llvm/test/Transforms/SCCP/ipsccp-preserve-pdt.ll
@@ -17,11 +17,11 @@
; CHECK-NOT: <badref>
; CHECK: Inorder PostDominator Tree: DFSNumbers invalid: 0 slow queries.
; CHECK-NEXT: [1] <<exit node>> {4294967295,4294967295} [0]
+; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1]
+; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2]
; CHECK-NEXT: [2] %for.body {4294967295,4294967295} [1]
; CHECK-NEXT: [2] %if.end4 {4294967295,4294967295} [1]
; CHECK-NEXT: [3] %entry {4294967295,4294967295} [2]
-; CHECK-NEXT: [2] %for.cond34 {4294967295,4294967295} [1]
-; CHECK-NEXT: [3] %for.cond16 {4294967295,4294967295} [2]
; CHECK-NEXT: Roots: %for.cond34 %for.body
; CHECK-NEXT: PostDominatorTree for function: bar
; CHECK-NOT: <badref>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index 807d246..d79aed8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -428,14 +428,14 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
@@ -1231,29 +1231,29 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
-; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]]
@@ -1262,7 +1262,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]]
; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 18, i32 22, i32 26, i32 30, i32 1, i32 5, i32 9, i32 13, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]]
; CHECK-NEXT: [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
new file mode 100644
index 0000000..c51bdedd
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -0,0 +1,366 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; FIXME: Should not vectorize on gfx8
+
+; GCN-LABEL: @fadd_combine_v2f16
+; GCN: fadd <2 x half>
+define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = fadd half %tmp3, 1.000000e+00
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = fadd half %tmp7, 1.000000e+00
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fsub_combine_v2f16
+; GCN: fsub <2 x half>
+define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = fsub half %tmp3, 1.000000e+00
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = fsub half %tmp7, 1.000000e+00
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fmul_combine_v2f16
+; GCN: fmul <2 x half>
+define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = fmul half %tmp3, 1.000000e+00
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = fmul half %tmp7, 1.000000e+00
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @fdiv_combine_v2f16
+; GCN: fdiv <2 x half>
+define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = fdiv half %tmp3, 1.000000e+00
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = fdiv half %tmp7, 1.000000e+00
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @frem_combine_v2f16
+; GCN: frem <2 x half>
+define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = frem half %tmp3, 1.000000e+00
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = frem half %tmp7, 1.000000e+00
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fma_combine_v2f16
+; GCN: call <2 x half> @llvm.fma.v2f16
+define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = tail call half @llvm.fma.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = tail call half @llvm.fma.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @fmuladd_combine_v2f16
+; GCN: call <2 x half> @llvm.fmuladd.v2f16
+define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = tail call half @llvm.fmuladd.f16(half %tmp3, half 1.000000e+00, half 1.000000e+00)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = tail call half @llvm.fmuladd.f16(half %tmp7, half 1.000000e+00, half 1.000000e+00)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @minnum_combine_v2f16
+; GFX8: call half @llvm.minnum.f16(
+; GFX8: call half @llvm.minnum.f16(
+
+; GFX9: call <2 x half> @llvm.minnum.v2f16
+define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.minnum.f16(half %tmp3, half 1.000000e+00)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.minnum.f16(half %tmp7, half 1.000000e+00)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @maxnum_combine_v2f16
+; GFX8: call half @llvm.maxnum.f16(
+; GFX8: call half @llvm.maxnum.f16(
+
+; GFX9: call <2 x half> @llvm.maxnum.v2f16
+define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.maxnum.f16(half %tmp3, half 1.000000e+00)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.maxnum.f16(half %tmp7, half 1.000000e+00)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; FIXME: Should vectorize
+; GCN-LABEL: @minimum_combine_v2f16
+; GCN: call half @llvm.minimum.f16(
+; GCN: call half @llvm.minimum.f16(
+define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.minimum.f16(half %tmp3, half 1.000000e+00)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.minimum.f16(half %tmp7, half 1.000000e+00)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @maximum_combine_v2f16
+; GCN: call half @llvm.maximum.f16(
+; GCN: call half @llvm.maximum.f16(
+define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.maximum.f16(half %tmp3, half 1.000000e+00)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.maximum.f16(half %tmp7, half 1.000000e+00)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @canonicalize_combine_v2f16
+; GFX8: call half @llvm.canonicalize.f16(
+; GFX8: call half @llvm.canonicalize.f16(
+
+; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
+define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @fabs_combine_v2f16
+; GCN: call <2 x half> @llvm.fabs.v2f16(
+define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.fabs.f16(half %tmp3)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.fabs.f16(half %tmp7)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @fneg_combine_v2f16
+; GCN: fneg <2 x half>
+define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = fneg half %tmp3
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = fneg half %tmp7
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; GCN-LABEL: @copysign_combine_v2f16
+; GCN: call <2 x half> @llvm.copysign.v2f16(
+define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+ ret void
+}
+
+; FIXME: Should always vectorize
+; GCN-LABEL: @copysign_combine_v4f16
+; GCN: call <2 x half> @llvm.copysign.v2f16(
+
+; GFX8: call half @llvm.copysign.f16(
+; GFX8: call half @llvm.copysign.f16(
+
+; GFX9: call <2 x half> @llvm.copysign.v2f16(
+define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.copysign.f16(half %tmp3, half %sign)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.copysign.f16(half %tmp7, half %sign)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+
+ %tmp9 = add nuw nsw i64 %tmp1, 2
+ %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
+ %tmp11 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp12 = call half @llvm.copysign.f16(half %tmp11, half %sign)
+ store half %tmp12, ptr addrspace(1) %tmp10, align 2
+
+ %tmp13 = add nuw nsw i64 %tmp1, 3
+ %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
+ %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
+ %tmp16 = call half @llvm.copysign.f16(half %tmp15, half %sign)
+ store half %tmp16, ptr addrspace(1) %tmp14, align 2
+ ret void
+}
+
+; GCN-LABEL: @canonicalize_combine_v4f16
+; GFX8: call half @llvm.canonicalize.f16(
+; GFX8: call half @llvm.canonicalize.f16(
+
+; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
+; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
+define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = zext i32 %tmp to i64
+
+ %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+ %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+ %tmp4 = call half @llvm.canonicalize.f16(half %tmp3)
+ store half %tmp4, ptr addrspace(1) %tmp2, align 2
+
+ %tmp5 = add nuw nsw i64 %tmp1, 1
+ %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+ %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp8 = call half @llvm.canonicalize.f16(half %tmp7)
+ store half %tmp8, ptr addrspace(1) %tmp6, align 2
+
+ %tmp9 = add nuw nsw i64 %tmp1, 2
+ %tmp10 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp9
+ %tmp11 = load half, ptr addrspace(1) %tmp6, align 2
+ %tmp12 = call half @llvm.canonicalize.f16(half %tmp11)
+ store half %tmp12, ptr addrspace(1) %tmp10, align 2
+
+ %tmp13 = add nuw nsw i64 %tmp1, 3
+ %tmp14 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp13
+ %tmp15 = load half, ptr addrspace(1) %tmp14, align 2
+ %tmp16 = call half @llvm.canonicalize.f16(half %tmp15)
+ store half %tmp16, ptr addrspace(1) %tmp14, align 2
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll
index 31ad6291..3d3d00f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll
@@ -13,11 +13,11 @@ define void @test() {
; CHECK-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i64> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[LOOP]] ]
; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i64> [[TMP6]], <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP1]], <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 2
-; CHECK-NEXT: [[OP_RDX33:%.*]] = add i64 [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[OP_RDX25]] = add i64 [[OP_RDX33]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
+; CHECK-NEXT: [[OP_RDX16:%.*]] = add i64 [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[OP_RDX25]] = add i64 [[OP_RDX16]], [[TMP3]]
; CHECK-NEXT: br label [[LOOP]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
new file mode 100644
index 0000000..f1a5709
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-99999 < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: br label %[[BB43:.*]]
+; CHECK: [[BB20:.*]]:
+; CHECK-NEXT: br label %[[BB105:.*]]
+; CHECK: [[BB43]]:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x ptr addrspace(1)> [ [[TMP1:%.*]], %[[BB51:.*]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-NEXT: br i1 false, label %[[BB105]], label %[[BB51]]
+; CHECK: [[BB51]]:
+; CHECK-NEXT: [[TMP1]] = phi <2 x ptr addrspace(1)> [ poison, %[[BB54:.*]] ], [ zeroinitializer, %[[BB43]] ]
+; CHECK-NEXT: br label %[[BB43]]
+; CHECK: [[BB54]]:
+; CHECK-NEXT: br label %[[BB51]]
+; CHECK: [[BB105]]:
+; CHECK-NEXT: [[PHI106:%.*]] = phi ptr addrspace(1) [ null, %[[BB20]] ], [ null, %[[BB43]] ]
+; CHECK-NEXT: ret void
+;
+bb:
+ %0 = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <2 x i32> <i32 1, i32 0>
+ %1 = extractelement <2 x ptr addrspace(1)> %0, i32 0
+ %2 = extractelement <2 x ptr addrspace(1)> %0, i32 1
+ br label %bb43
+
+bb20:
+ br label %bb105
+
+bb43:
+ %phi441 = phi ptr addrspace(1) [ %4, %bb51 ], [ %2, %bb ]
+ %phi452 = phi ptr addrspace(1) [ %5, %bb51 ], [ %1, %bb ]
+ br i1 false, label %bb105, label %bb51
+
+bb51:
+ %3 = phi <2 x ptr addrspace(1)> [ poison, %bb54 ], [ zeroinitializer, %bb43 ]
+ %4 = extractelement <2 x ptr addrspace(1)> %3, i32 0
+ %5 = extractelement <2 x ptr addrspace(1)> %3, i32 1
+ br label %bb43
+
+bb54:
+ br label %bb51
+
+bb105:
+ %phi106 = phi ptr addrspace(1) [ %1, %bb20 ], [ null, %bb43 ]
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
index be790b7..e66cce1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
@@ -4,30 +4,24 @@
define i32 @foo(i32 %a) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[A:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]]
+; CHECK-NEXT: [[LOCAL:%.*]] = sub nsw i32 0, 0
; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], <i32 1, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
-; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[LOCAL]], 3
+; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], 0
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
-; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX11]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
+; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX3]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
; CHECK-NEXT: ret i32 0
; CHECK: bb4:
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT: [[OP_RDX8:%.*]] = add i32 [[TMP9]], 0
-; CHECK-NEXT: [[OP_RDX9:%.*]] = add i32 [[OP_RDX8]], [[TMP3]]
-; CHECK-NEXT: ret i32 [[OP_RDX9]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[LOCAL]], 8
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], 0
+; CHECK-NEXT: ret i32 [[OP_RDX1]]
; CHECK: bb5:
; CHECK-NEXT: br label [[BB4:%.*]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index de06daa..fa022ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -977,64 +977,54 @@ define i32 @maxi8_wrong_parent(i32) {
; SSE4-LABEL: @maxi8_wrong_parent(
; SSE4-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16
; SSE4-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; SSE4-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
; SSE4-NEXT: br label [[PP:%.*]]
; SSE4: pp:
-; SSE4-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; SSE4-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; SSE4-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; SSE4-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; SSE4-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; SSE4-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; SSE4-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; SSE4-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; SSE4-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; SSE4-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE4-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; SSE4-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; SSE4-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
+; SSE4-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP2]]
+; SSE4-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP2]]
+; SSE4-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SSE4-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP5]], i32 [[TMP6]]
; SSE4-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
; SSE4-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
-; SSE4-NEXT: ret i32 [[OP_RDX5]]
+; SSE4-NEXT: [[OP_RDX6:%.*]] = icmp sgt i32 [[OP_RDX5]], [[TMP3]]
+; SSE4-NEXT: [[OP_RDX7:%.*]] = select i1 [[OP_RDX6]], i32 [[OP_RDX5]], i32 [[TMP3]]
+; SSE4-NEXT: ret i32 [[OP_RDX7]]
;
; AVX-LABEL: @maxi8_wrong_parent(
; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16
; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
; AVX-NEXT: br label [[PP:%.*]]
; AVX: pp:
-; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; AVX-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
+; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP2]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP2]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP5]], i32 [[TMP6]]
; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
-; AVX-NEXT: ret i32 [[OP_RDX5]]
+; AVX-NEXT: [[OP_RDX6:%.*]] = icmp sgt i32 [[OP_RDX5]], [[TMP3]]
+; AVX-NEXT: [[OP_RDX7:%.*]] = select i1 [[OP_RDX6]], i32 [[OP_RDX5]], i32 [[TMP3]]
+; AVX-NEXT: ret i32 [[OP_RDX7]]
;
; THRESH-LABEL: @maxi8_wrong_parent(
; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
-; THRESH-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
; THRESH-NEXT: br label [[PP:%.*]]
; THRESH: pp:
-; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; THRESH-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; THRESH-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; THRESH-NEXT: [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
-; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
-; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1
-; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1
-; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
-; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
-; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]]
-; THRESH-NEXT: ret i32 [[OP_RDX5]]
+; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP9]])
+; THRESH-NEXT: ret i32 [[TMP10]]
;
%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 4dea523..31f0e06 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,23 +6,19 @@ define i16 @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
-; CHECK-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
; CHECK-NEXT: br label [[WHILE:%.*]]
; CHECK: while:
-; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
+; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX26:%.*]], [[WHILE]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
-; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 0, [[TMP2]]
+; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX25:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX24]]
+; CHECK-NEXT: [[OP_RDX26]] = xor i64 [[OP_RDX25]], [[TMP5]]
; CHECK-NEXT: br label [[WHILE]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
index f032d4b..e8abcce 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
@@ -7,15 +7,13 @@ define void @tes() {
; CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
; CHECK-NEXT: br label [[TMP1:%.*]]
; CHECK: 1:
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 false, i1 false, i1 false
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP4]], i1 false
-; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP2]], i1 [[OP_RDX]], i1 false
-; CHECK-NEXT: br i1 [[OP_RDX1]], label [[TMP5:%.*]], label [[TMP6:%.*]]
-; CHECK: 5:
+; CHECK-NEXT: br i1 [[OP_RDX]], label [[TMP6:%.*]], label [[TMP5:%.*]]
+; CHECK: 4:
; CHECK-NEXT: ret void
-; CHECK: 6:
+; CHECK: 5:
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 838a75d..c25e07c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -436,10 +436,9 @@ define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
; CHECK-LABEL: @logical_and_icmp_extra_op(
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[C:%.*]], i1 [[C]], i1 false
; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[S3]], i1 [[TMP3]], i1 false
+; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 [[C:%.*]], i1 false
; CHECK-NEXT: ret i1 [[OP_RDX]]
;
%x0 = extractelement <4 x i32> %x, i32 0
@@ -465,10 +464,9 @@ define i1 @logical_and_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
define i1 @logical_or_icmp_extra_op(<4 x i32> %x, <4 x i32> %y, i1 %c) {
; CHECK-LABEL: @logical_or_icmp_extra_op(
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[S3:%.*]] = select i1 [[C:%.*]], i1 true, i1 [[C]]
; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[S3]], i1 true, i1 [[TMP3]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP3]], i1 true, i1 [[C:%.*]]
; CHECK-NEXT: ret i1 [[OP_RDX]]
;
%x0 = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 8f1d7a1..69ecf18 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,9 +7,11 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4
; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
+; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -19,11 +21,10 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT: [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -33,19 +34,20 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
-; CHECK-NEXT: [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
+; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
+; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
+; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll
index c2dc6d0..a6e10611 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec.ll
@@ -58,3 +58,33 @@ entry:
store <8 x i16> %4, ptr %5, align 2
ret void
}
+
+define void @test3(ptr %x, ptr %y, ptr %z) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[X:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> [[TMP0]], ptr [[Y:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x ptr> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr [[Y]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> [[TMP4]]
+; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr [[Z:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = getelementptr inbounds i32, ptr %x, i64 4
+ %1 = getelementptr inbounds i32, ptr %y, i64 4
+ %2 = load <4 x i32>, ptr %x, align 4
+ %3 = load <4 x i32>, ptr %0, align 4
+ %4 = load <4 x i32>, ptr %y, align 4
+ %5 = load <4 x i32>, ptr %1, align 4
+ %6 = icmp eq ptr %x, null
+ %7 = icmp eq ptr %y, null
+ %8 = select i1 %6, <4 x i32> %2, <4 x i32> %4
+ %9 = select i1 %7, <4 x i32> %3, <4 x i32> %5
+ %10 = getelementptr inbounds i32, ptr %z, i64 4
+ store <4 x i32> %8, ptr %z, align 4
+ store <4 x i32> %9, ptr %10, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SROA/phi-and-select.ll b/llvm/test/Transforms/SROA/phi-and-select.ll
index 7c8b27c..616617b 100644
--- a/llvm/test/Transforms/SROA/phi-and-select.ll
+++ b/llvm/test/Transforms/SROA/phi-and-select.ll
@@ -344,6 +344,40 @@ entry:
ret i32 %loaded
}
+; We should not unconditionally load with sanitizers.
+define i32 @test9_asan(i32 %b, ptr %ptr) sanitize_address {
+; Same as @test8 but for a select rather than a PHI node.
+;
+; CHECK-PRESERVE-CFG-LABEL: @test9_asan(
+; CHECK-PRESERVE-CFG-NEXT: entry:
+; CHECK-PRESERVE-CFG-NEXT: [[F:%.*]] = alloca float, align 4
+; CHECK-PRESERVE-CFG-NEXT: store i32 0, ptr [[PTR:%.*]], align 4
+; CHECK-PRESERVE-CFG-NEXT: [[TEST:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-PRESERVE-CFG-NEXT: [[SELECT:%.*]] = select i1 [[TEST]], ptr [[F]], ptr [[PTR]]
+; CHECK-PRESERVE-CFG-NEXT: [[LOADED:%.*]] = load i32, ptr [[SELECT]], align 4
+; CHECK-PRESERVE-CFG-NEXT: ret i32 [[LOADED]]
+;
+; CHECK-MODIFY-CFG-LABEL: @test9_asan(
+; CHECK-MODIFY-CFG-NEXT: entry:
+; CHECK-MODIFY-CFG-NEXT: store i32 0, ptr [[PTR:%.*]], align 4
+; CHECK-MODIFY-CFG-NEXT: [[TEST:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-MODIFY-CFG-NEXT: [[LOADED_ELSE_VAL:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-MODIFY-CFG-NEXT: br i1 [[TEST]], label [[ENTRY_THEN:%.*]], label [[ENTRY_CONT:%.*]]
+; CHECK-MODIFY-CFG: entry.then:
+; CHECK-MODIFY-CFG-NEXT: br label [[ENTRY_CONT]]
+; CHECK-MODIFY-CFG: entry.cont:
+; CHECK-MODIFY-CFG-NEXT: [[LOADED:%.*]] = phi i32 [ undef, [[ENTRY_THEN]] ], [ [[LOADED_ELSE_VAL]], [[ENTRY:%.*]] ]
+; CHECK-MODIFY-CFG-NEXT: ret i32 [[LOADED]]
+;
+entry:
+ %f = alloca float
+ store i32 0, ptr %ptr
+ %test = icmp ne i32 %b, 0
+ %select = select i1 %test, ptr %f, ptr %ptr
+ %loaded = load i32, ptr %select, align 4
+ ret i32 %loaded
+}
+
define float @test10(i32 %b, ptr %ptr) {
; Don't try to promote allocas which are not eligible for it even after
; rewriting due to the necessity of inserting bitcasts when speculating a PHI
diff --git a/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll b/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll
index a51c757..76e00a9 100644
--- a/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll
+++ b/llvm/test/Transforms/SROA/phi-with-duplicate-pred.ll
@@ -16,8 +16,8 @@ define void @f2(i1 %c1) {
; CHECK: cleanup:
; CHECK-NEXT: [[G_0_SROA_SPECULATE_LOAD_CLEANUP:%.*]] = load i16, ptr @a, align 1
; CHECK-NEXT: switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT: i32 2, label [[LBL1]]
+; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT: i32 2, label [[LBL1]]
; CHECK-NEXT: ]
; CHECK: if.else:
; CHECK-NEXT: br label [[LBL1]]
@@ -52,6 +52,52 @@ cleanup7: ; preds = %cleanup
ret void
}
+define void @f2_hwasan(i1 %c1) sanitize_hwaddress {
+; CHECK-LABEL: @f2_hwasan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[E:%.*]] = alloca i16, align 1
+; CHECK-NEXT: br i1 [[C1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[CLEANUP:%.*]]
+; CHECK: cleanup:
+; CHECK-NEXT: switch i32 2, label [[CLEANUP7:%.*]] [
+; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT: i32 2, label [[LBL1]]
+; CHECK-NEXT: ]
+; CHECK: if.else:
+; CHECK-NEXT: br label [[LBL1]]
+; CHECK: lbl1:
+; CHECK-NEXT: [[G_0:%.*]] = phi ptr [ @a, [[CLEANUP]] ], [ @a, [[CLEANUP]] ], [ [[E]], [[IF_ELSE]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[G_0]], align 1
+; CHECK-NEXT: unreachable
+; CHECK: cleanup7:
+; CHECK-NEXT: ret void
+;
+entry:
+ %e = alloca i16, align 1
+ br i1 %c1, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ br label %cleanup
+
+cleanup: ; preds = %if.then
+ switch i32 2, label %cleanup7 [
+ i32 0, label %lbl1
+ i32 2, label %lbl1
+ ]
+
+if.else: ; preds = %entry
+ br label %lbl1
+
+lbl1: ; preds = %if.else, %cleanup, %cleanup
+ %g.0 = phi ptr [ @a, %cleanup ], [ @a, %cleanup ], [ %e, %if.else ]
+ %0 = load i16, ptr %g.0, align 1
+ unreachable
+
+cleanup7: ; preds = %cleanup
+ ret void
+}
+
define void @f3(i1 %c1) {
; CHECK-LABEL: @f3(
; CHECK-NEXT: entry:
@@ -61,8 +107,8 @@ define void @f3(i1 %c1) {
; CHECK-NEXT: br label [[CLEANUP:%.*]]
; CHECK: cleanup:
; CHECK-NEXT: switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT: i32 2, label [[LBL1]]
+; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT: i32 2, label [[LBL1]]
; CHECK-NEXT: ]
; CHECK: if.else:
; CHECK-NEXT: br label [[LBL1]]
@@ -112,8 +158,8 @@ define void @f4(i1 %c1) {
; CHECK-NEXT: br label [[CLEANUP:%.*]]
; CHECK: cleanup:
; CHECK-NEXT: switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT: i32 2, label [[LBL1]]
+; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT: i32 2, label [[LBL1]]
; CHECK-NEXT: ]
; CHECK: if.else:
; CHECK-NEXT: br label [[LBL1]]
@@ -165,8 +211,8 @@ define void @f5(i1 %c1, i1 %c2) {
; CHECK-NEXT: br label [[CLEANUP:%.*]]
; CHECK: cleanup:
; CHECK-NEXT: switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT: i32 2, label [[LBL1]]
+; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT: i32 2, label [[LBL1]]
; CHECK-NEXT: ]
; CHECK: if.else:
; CHECK-NEXT: br label [[LBL1]]
@@ -216,8 +262,8 @@ define void @f6(i1 %c1) {
; CHECK-NEXT: br label [[CLEANUP:%.*]]
; CHECK: cleanup:
; CHECK-NEXT: switch i32 2, label [[CLEANUP7:%.*]] [
-; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
-; CHECK-NEXT: i32 2, label [[LBL1]]
+; CHECK-NEXT: i32 0, label [[LBL1:%.*]]
+; CHECK-NEXT: i32 2, label [[LBL1]]
; CHECK-NEXT: ]
; CHECK: if.else:
; CHECK-NEXT: br label [[LBL1]]
diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll
index dbcb6d0..4eb6a71 100644
--- a/llvm/test/Transforms/SROA/pr57796.ll
+++ b/llvm/test/Transforms/SROA/pr57796.ll
@@ -29,13 +29,13 @@ entry:
%call.i = call align 32 ptr @value_set_type(ptr align 32 %ref.tmp.i)
%0 = load <32 x i8>, ptr %call.i, align 32
store <32 x i8> %0, ptr %ref.tmp, align 32
- %1 = load x86_mmx, ptr %ref.tmp, align 32
- %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 0)
- store x86_mmx %2, ptr @A, align 8
+ %1 = load <1 x i64>, ptr %ref.tmp, align 32
+ %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 0)
+ store <1 x i64> %2, ptr @A, align 8
ret void
}
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8 immarg)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8 immarg)
declare dso_local void @value_create(ptr sret(%struct.Value) align 32)
diff --git a/llvm/test/Transforms/SROA/select-load.ll b/llvm/test/Transforms/SROA/select-load.ll
index 7df7241..9de7650 100644
--- a/llvm/test/Transforms/SROA/select-load.ll
+++ b/llvm/test/Transforms/SROA/select-load.ll
@@ -36,7 +36,7 @@ entry:
%st.args = type { i32, ptr }
; A bitcasted load and a direct load of select.
-define void @test_multiple_loads_select(i1 %cmp){
+define void @test_multiple_loads_select(i1 %cmp) {
; CHECK-LABEL: @test_multiple_loads_select(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ADDR_I8_SROA_SPECULATED:%.*]] = select i1 [[CMP:%.*]], ptr undef, ptr undef
@@ -57,6 +57,51 @@ entry:
ret void
}
+; Sanitizers will break the optimization.
+define void @test_multiple_loads_select_asan(i1 %cmp) sanitize_address {
+; CHECK-PRESERVE-CFG-LABEL: @test_multiple_loads_select_asan(
+; CHECK-PRESERVE-CFG-NEXT: entry:
+; CHECK-PRESERVE-CFG-NEXT: [[ARGS_SROA_0:%.*]] = alloca ptr, align 8
+; CHECK-PRESERVE-CFG-NEXT: [[ARGS_SROA_1:%.*]] = alloca ptr, align 8
+; CHECK-PRESERVE-CFG-NEXT: [[SEL_SROA_SEL:%.*]] = select i1 [[CMP:%.*]], ptr [[ARGS_SROA_1]], ptr [[ARGS_SROA_0]]
+; CHECK-PRESERVE-CFG-NEXT: [[ADDR_I8:%.*]] = load ptr, ptr [[SEL_SROA_SEL]], align 8
+; CHECK-PRESERVE-CFG-NEXT: call void @foo_i8(ptr [[ADDR_I8]])
+; CHECK-PRESERVE-CFG-NEXT: [[ADDR_I32:%.*]] = load ptr, ptr [[SEL_SROA_SEL]], align 8
+; CHECK-PRESERVE-CFG-NEXT: call void @foo_i32(ptr [[ADDR_I32]])
+; CHECK-PRESERVE-CFG-NEXT: ret void
+;
+; CHECK-MODIFY-CFG-LABEL: @test_multiple_loads_select_asan(
+; CHECK-MODIFY-CFG-NEXT: entry:
+; CHECK-MODIFY-CFG-NEXT: br i1 [[CMP:%.*]], label [[ENTRY_THEN:%.*]], label [[ENTRY_ELSE:%.*]]
+; CHECK-MODIFY-CFG: entry.then:
+; CHECK-MODIFY-CFG-NEXT: br label [[ENTRY_CONT:%.*]]
+; CHECK-MODIFY-CFG: entry.else:
+; CHECK-MODIFY-CFG-NEXT: br label [[ENTRY_CONT]]
+; CHECK-MODIFY-CFG: entry.cont:
+; CHECK-MODIFY-CFG-NEXT: [[ADDR_I8:%.*]] = phi ptr [ undef, [[ENTRY_THEN]] ], [ undef, [[ENTRY_ELSE]] ]
+; CHECK-MODIFY-CFG-NEXT: call void @foo_i8(ptr [[ADDR_I8]])
+; CHECK-MODIFY-CFG-NEXT: br i1 [[CMP]], label [[ENTRY_CONT_THEN:%.*]], label [[ENTRY_CONT_ELSE:%.*]]
+; CHECK-MODIFY-CFG: entry.cont.then:
+; CHECK-MODIFY-CFG-NEXT: br label [[ENTRY_CONT_CONT:%.*]]
+; CHECK-MODIFY-CFG: entry.cont.else:
+; CHECK-MODIFY-CFG-NEXT: br label [[ENTRY_CONT_CONT]]
+; CHECK-MODIFY-CFG: entry.cont.cont:
+; CHECK-MODIFY-CFG-NEXT: [[ADDR_I32:%.*]] = phi ptr [ undef, [[ENTRY_CONT_THEN]] ], [ undef, [[ENTRY_CONT_ELSE]] ]
+; CHECK-MODIFY-CFG-NEXT: call void @foo_i32(ptr [[ADDR_I32]])
+; CHECK-MODIFY-CFG-NEXT: ret void
+;
+entry:
+ %args = alloca [2 x %st.args], align 16
+ %arr1 = getelementptr inbounds [2 x %st.args], ptr %args, i64 0, i64 1
+ %sel = select i1 %cmp, ptr %arr1, ptr %args
+ %addr = getelementptr inbounds %st.args, ptr %sel, i64 0, i32 1
+ %addr.i8 = load ptr, ptr %addr, align 8
+ call void @foo_i8(ptr %addr.i8)
+ %addr.i32 = load ptr, ptr %addr, align 8
+ call void @foo_i32 (ptr %addr.i32)
+ ret void
+}
+
declare void @foo_i8(ptr)
declare void @foo_i32(ptr)
@@ -414,13 +459,13 @@ define void @load_of_select_with_noundef_nonnull(ptr %buffer, i1 %b) {
; CHECK-PRESERVE-CFG-LABEL: @load_of_select_with_noundef_nonnull(
; CHECK-PRESERVE-CFG-NEXT: [[UB_PTR:%.*]] = alloca ptr, align 8
; CHECK-PRESERVE-CFG-NEXT: [[SELECT_PTR:%.*]] = select i1 [[B:%.*]], ptr [[BUFFER:%.*]], ptr [[UB_PTR]]
-; CHECK-PRESERVE-CFG-NEXT: [[LOAD_PTR:%.*]] = load ptr, ptr [[SELECT_PTR]], align 8, !nonnull !1, !noundef !1
+; CHECK-PRESERVE-CFG-NEXT: [[LOAD_PTR:%.*]] = load ptr, ptr [[SELECT_PTR]], align 8, !nonnull [[META1:![0-9]+]], !noundef [[META1]]
; CHECK-PRESERVE-CFG-NEXT: ret void
;
; CHECK-MODIFY-CFG-LABEL: @load_of_select_with_noundef_nonnull(
; CHECK-MODIFY-CFG-NEXT: br i1 [[B:%.*]], label [[DOTTHEN:%.*]], label [[DOTCONT:%.*]]
; CHECK-MODIFY-CFG: .then:
-; CHECK-MODIFY-CFG-NEXT: [[LOAD_PTR_THEN_VAL:%.*]] = load ptr, ptr [[BUFFER:%.*]], align 8, !nonnull !2, !noundef !2
+; CHECK-MODIFY-CFG-NEXT: [[LOAD_PTR_THEN_VAL:%.*]] = load ptr, ptr [[BUFFER:%.*]], align 8, !nonnull [[META2:![0-9]+]], !noundef [[META2]]
; CHECK-MODIFY-CFG-NEXT: br label [[DOTCONT]]
; CHECK-MODIFY-CFG: .cont:
; CHECK-MODIFY-CFG-NEXT: [[LOAD_PTR:%.*]] = phi ptr [ [[LOAD_PTR_THEN_VAL]], [[DOTTHEN]] ], [ undef, [[TMP0:%.*]] ]
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index f883282..c4aba63 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -626,6 +626,58 @@ define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceab
ret <8 x i16> %r
}
+; Negative sanitizer tests.
+
+define <4 x i32> @load_i32_insert_v4i32_asan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_address {
+; CHECK-LABEL: @load_i32_insert_v4i32_asan(
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i32 0
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %s = load i32, ptr %p, align 4
+ %r = insertelement <4 x i32> poison, i32 %s, i32 0
+ ret <4 x i32> %r
+}
+
+define <4 x float> @load_v2f32_extract_insert_v4f32_hwasan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_hwaddress {
+; CHECK-LABEL: @load_v2f32_extract_insert_v4f32_hwasan(
+; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+ %l = load <2 x float>, ptr %p, align 4
+ %s = extractelement <2 x float> %l, i32 0
+ %r = insertelement <4 x float> poison, float %s, i32 0
+ ret <4 x float> %r
+}
+
+define <4 x float> @load_v2f32_extract_insert_v4f32_tsan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_thread {
+; CHECK-LABEL: @load_v2f32_extract_insert_v4f32_tsan(
+; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+ %l = load <2 x float>, ptr %p, align 4
+ %s = extractelement <2 x float> %l, i32 0
+ %r = insertelement <4 x float> poison, float %s, i32 0
+ ret <4 x float> %r
+}
+
+; Double-negative msan tests: msan is OK with the optimization.
+
+define <2 x float> @load_f32_insert_v2f32_msan(ptr align 16 dereferenceable(16) %p) nofree nosync sanitize_memory {
+; CHECK-LABEL: @load_f32_insert_v2f32_msan(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: ret <2 x float> [[R]]
+;
+ %s = load float, ptr %p, align 4
+ %r = insertelement <2 x float> poison, float %s, i32 0
+ ret <2 x float> %r
+}
+
; PR30986 - split vector loads for scalarized operations
define <2 x i64> @PR30986(ptr %0) {
; CHECK-LABEL: @PR30986(
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index a53abab..30a0898 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -382,3 +382,83 @@ define <4 x i32> @load_v2i32_v4i32_addrspacecast(ptr addrspace(5) align 16 deref
%s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
ret <4 x i32> %s
}
+
+; Negative-negative tests with msan, which should still allow widening.
+
+define <4 x float> @load_v1f32_v4f32_msan(ptr dereferenceable(16) %p) sanitize_memory {
+; CHECK-LABEL: @load_v1f32_v4f32_msan(
+; CHECK-NEXT: [[S:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
+; CHECK-NEXT: ret <4 x float> [[S]]
+;
+ %l = load <1 x float>, ptr %p, align 16
+ %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ ret <4 x float> %s
+}
+
+; Negative tests with sanitizers.
+
+define <4 x float> @load_v1f32_v4f32_asan(ptr dereferenceable(16) %p) sanitize_address {
+; CHECK-LABEL: @load_v1f32_v4f32_asan(
+; CHECK-NEXT: [[L:%.*]] = load <1 x float>, ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x float> [[L]], <1 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: ret <4 x float> [[S]]
+;
+ %l = load <1 x float>, ptr %p, align 16
+ %s = shufflevector <1 x float> %l, <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ ret <4 x float> %s
+}
+
+define <4 x float> @load_v2f32_v4f32_hwasan(ptr align 16 dereferenceable(16) %p) sanitize_hwaddress {
+; CHECK-LABEL: @load_v2f32_v4f32_hwasan(
+; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: ret <4 x float> [[S]]
+;
+ %l = load <2 x float>, ptr %p, align 1
+ %s = shufflevector <2 x float> %l, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ ret <4 x float> %s
+}
+
+define <4 x float> @load_v3f32_v4f32_tsan(ptr dereferenceable(16) %p) sanitize_thread {
+; CHECK-LABEL: @load_v3f32_v4f32_tsan(
+; CHECK-NEXT: [[L:%.*]] = load <3 x float>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <3 x float> [[L]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT: ret <4 x float> [[S]]
+;
+ %l = load <3 x float>, ptr %p, align 1
+ %s = shufflevector <3 x float> %l, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+ ret <4 x float> %s
+}
+
+define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize_hwaddress {
+; CHECK-LABEL: @load_v2f32_v8f32_hwasan(
+; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x float> [[L]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: ret <8 x float> [[S]]
+;
+ %l = load <2 x float>, ptr %p, align 1
+ %s = shufflevector <2 x float> %l, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x float> %s
+}
+
+define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
+; CHECK-LABEL: @load_v2i32_v4i32_asan(
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: ret <4 x i32> [[S]]
+;
+ %l = load <2 x i32>, ptr %p, align 1
+ %s = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ ret <4 x i32> %s
+}
+
+define <4 x i32> @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(ptr dereferenceable(16) %p) sanitize_hwaddress {
+; CHECK-LABEL: @load_v2i32_v4i32_non_canonical_mask_commute_hwasan(
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> poison, <2 x i32> [[L]], <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: ret <4 x i32> [[S]]
+;
+ %l = load <2 x i32>, ptr %p, align 1
+ %s = shufflevector <2 x i32> poison, <2 x i32> %l, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ ret <4 x i32> %s
+}
diff --git a/llvm/test/Transforms/lower-builtin-allow-check.ll b/llvm/test/Transforms/lower-builtin-allow-check.ll
index 05d940a..bcd9722 100644
--- a/llvm/test/Transforms/lower-builtin-allow-check.ll
+++ b/llvm/test/Transforms/lower-builtin-allow-check.ll
@@ -1,8 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes='function(lower-allow-check)' -S | FileCheck %s --check-prefixes=NOPROFILE
; RUN: opt < %s -passes='function(lower-allow-check)' -lower-allow-check-random-rate=0 -S | FileCheck %s --check-prefixes=NONE
+; RUN: opt < %s -passes='function(lower-allow-check)' -lower-allow-check-random-rate=1 -S | FileCheck %s --check-prefixes=ALL
; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=HOT99
; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=HOT70
+; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-random-rate=0 -lower-allow-check-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=NONE99
+; RUN: opt < %s -passes='require<profile-summary>,function(lower-allow-check)' -lower-allow-check-random-rate=1 -lower-allow-check-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=ALL70
target triple = "x86_64-pc-linux-gnu"
@@ -36,6 +39,19 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; NONE-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; NONE-NEXT: ret i32 [[TMP5]]
;
+; ALL-LABEL: define dso_local noundef i32 @simple(
+; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; ALL-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL: 3:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT: unreachable
+; ALL: 4:
+; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL-NEXT: ret i32 [[TMP5]]
+;
; HOT99-LABEL: define dso_local noundef i32 @simple(
; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
@@ -62,6 +78,32 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; HOT70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; HOT70-NEXT: ret i32 [[TMP5]]
;
+; NONE99-LABEL: define dso_local noundef i32 @simple(
+; NONE99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; NONE99-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; NONE99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NONE99: 3:
+; NONE99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT: unreachable
+; NONE99: 4:
+; NONE99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; NONE99-NEXT: ret i32 [[TMP5]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @simple(
+; ALL70-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; ALL70-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL70: 3:
+; ALL70-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT: unreachable
+; ALL70: 4:
+; ALL70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL70-NEXT: ret i32 [[TMP5]]
+;
%chk = icmp eq ptr %0, null
%allow = call i1 @llvm.allow.ubsan.check(i8 22)
%hot = xor i1 %allow, true
@@ -105,6 +147,19 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; NONE-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; NONE-NEXT: ret i32 [[TMP5]]
;
+; ALL-LABEL: define dso_local noundef i32 @hot(
+; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; ALL-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL: 3:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT: unreachable
+; ALL: 4:
+; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL-NEXT: ret i32 [[TMP5]]
+;
; HOT99-LABEL: define dso_local noundef i32 @hot(
; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
@@ -131,6 +186,32 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; HOT70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; HOT70-NEXT: ret i32 [[TMP5]]
;
+; NONE99-LABEL: define dso_local noundef i32 @hot(
+; NONE99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; NONE99-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; NONE99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NONE99: 3:
+; NONE99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT: unreachable
+; NONE99: 4:
+; NONE99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; NONE99-NEXT: ret i32 [[TMP5]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @hot(
+; ALL70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; ALL70-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL70: 3:
+; ALL70-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT: unreachable
+; ALL70: 4:
+; ALL70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL70-NEXT: ret i32 [[TMP5]]
+;
%chk = icmp eq ptr %0, null
%allow = call i1 @llvm.allow.ubsan.check(i8 22)
%hot = xor i1 %allow, true
@@ -173,6 +254,19 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; NONE-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; NONE-NEXT: ret i32 [[TMP5]]
;
+; ALL-LABEL: define dso_local noundef i32 @veryHot(
+; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; ALL-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL: 3:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT: unreachable
+; ALL: 4:
+; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL-NEXT: ret i32 [[TMP5]]
+;
; HOT99-LABEL: define dso_local noundef i32 @veryHot(
; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
@@ -199,6 +293,32 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; HOT70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; HOT70-NEXT: ret i32 [[TMP5]]
;
+; NONE99-LABEL: define dso_local noundef i32 @veryHot(
+; NONE99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; NONE99-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; NONE99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; NONE99: 3:
+; NONE99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT: unreachable
+; NONE99: 4:
+; NONE99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; NONE99-NEXT: ret i32 [[TMP5]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @veryHot(
+; ALL70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; ALL70-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP0]], null
+; ALL70-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL70-NEXT: [[TMP2:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; ALL70: 3:
+; ALL70-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT: unreachable
+; ALL70: 4:
+; ALL70-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; ALL70-NEXT: ret i32 [[TMP5]]
+;
%chk = icmp eq ptr %0, null
%allow = call i1 @llvm.allow.ubsan.check(i8 22)
%hot = xor i1 %allow, true
@@ -254,6 +374,25 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; NONE-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; NONE-NEXT: ret i32 [[TMP10]]
;
+; ALL-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; ALL-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; ALL-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; ALL: 4:
+; ALL-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT: [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL: 6:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT: unreachable
+; ALL: 7:
+; ALL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL-NEXT: br label [[TMP9]]
+; ALL: 9:
+; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL-NEXT: ret i32 [[TMP10]]
+;
; HOT99-LABEL: define dso_local noundef i32 @branchColdFnHot(
; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
@@ -292,6 +431,44 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; HOT70-NEXT: ret i32 [[TMP10]]
;
+; NONE99-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; NONE99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; NONE99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; NONE99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; NONE99: 4:
+; NONE99-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; NONE99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT: [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; NONE99: 6:
+; NONE99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT: unreachable
+; NONE99: 7:
+; NONE99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; NONE99-NEXT: br label [[TMP9]]
+; NONE99: 9:
+; NONE99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; NONE99-NEXT: ret i32 [[TMP10]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; ALL70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; ALL70-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; ALL70: 4:
+; ALL70-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT: [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL70: 6:
+; ALL70-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT: unreachable
+; ALL70: 7:
+; ALL70-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL70-NEXT: br label [[TMP9]]
+; ALL70: 9:
+; ALL70-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL70-NEXT: ret i32 [[TMP10]]
+;
%3 = icmp eq i32 %0, 0
br i1 %3, label %9, label %4, !prof !38
@@ -354,6 +531,25 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; NONE-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; NONE-NEXT: ret i32 [[TMP10]]
;
+; ALL-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; ALL-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; ALL-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; ALL: 4:
+; ALL-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL-NEXT: [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL: 6:
+; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL-NEXT: unreachable
+; ALL: 7:
+; ALL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL-NEXT: br label [[TMP9]]
+; ALL: 9:
+; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL-NEXT: ret i32 [[TMP10]]
+;
; HOT99-LABEL: define dso_local noundef i32 @branchHotFnCold(
; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
@@ -392,6 +588,44 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; HOT70-NEXT: ret i32 [[TMP10]]
;
+; NONE99-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; NONE99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; NONE99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; NONE99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; NONE99: 4:
+; NONE99-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; NONE99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; NONE99-NEXT: [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; NONE99-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; NONE99: 6:
+; NONE99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; NONE99-NEXT: unreachable
+; NONE99: 7:
+; NONE99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; NONE99-NEXT: br label [[TMP9]]
+; NONE99: 9:
+; NONE99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; NONE99-NEXT: ret i32 [[TMP10]]
+;
+; ALL70-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; ALL70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; ALL70-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; ALL70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; ALL70: 4:
+; ALL70-NEXT: [[CHK:%.*]] = icmp eq ptr [[TMP1]], null
+; ALL70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; ALL70-NEXT: [[TMP5:%.*]] = or i1 [[CHK]], [[HOT]]
+; ALL70-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; ALL70: 6:
+; ALL70-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; ALL70-NEXT: unreachable
+; ALL70: 7:
+; ALL70-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; ALL70-NEXT: br label [[TMP9]]
+; ALL70: 9:
+; ALL70-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; ALL70-NEXT: ret i32 [[TMP10]]
+;
%3 = icmp eq i32 %0, 0
br i1 %3, label %9, label %4, !prof !37
@@ -450,6 +684,11 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; NONE: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
; NONE: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
+; ALL: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; ALL: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; ALL: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; ALL: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+;.
; HOT99: [[PROF16]] = !{!"function_entry_count", i64 1000}
; HOT99: [[PROF17]] = !{!"function_entry_count", i64 7000}
; HOT99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
@@ -460,3 +699,13 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; HOT70: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
; HOT70: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
+; NONE99: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; NONE99: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; NONE99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; NONE99: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+;.
+; ALL70: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; ALL70: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; ALL70: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; ALL70: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+;.
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index fe70ba0..f835b98 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -3,12 +3,12 @@
; CHECK: atomic store operand must have integer, pointer, or floating point type!
; CHECK: atomic load operand must have integer, pointer, or floating point type!
-define void @foo(ptr %P, x86_mmx %v) {
- store atomic x86_mmx %v, ptr %P unordered, align 8
+define void @foo(ptr %P, <1 x i64> %v) {
+ store atomic <1 x i64> %v, ptr %P unordered, align 8
ret void
}
-define x86_mmx @bar(ptr %P) {
- %v = load atomic x86_mmx, ptr %P unordered, align 8
- ret x86_mmx %v
+define <1 x i64> @bar(ptr %P) {
+ %v = load atomic <1 x i64>, ptr %P unordered, align 8
+ ret <1 x i64> %v
}
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index 38b8ba1..bdba243 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -16,7 +16,7 @@ define i64 @i64_test(i64 %i) nounwind readnone {
; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23
; CHECK-NEXT: t38: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
; CHECK-NEXT: t18: ch,glue = CopyToReg # D:1 t16, Register:i32 $vgpr1, t38, t16:1
-; CHECK-NEXT: t19: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
+; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
; CHECK-EMPTY:
%loc = alloca i64, addrspace(5)
%j = load i64, ptr addrspace(5) %loc
@@ -33,8 +33,8 @@ define i64 @i32_test(i32 %i) nounwind readnone {
; CHECK-NEXT: t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0>
; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
; CHECK-NEXT: t22: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t14, Register:i32 $vgpr1, t22, t14:1
-; CHECK-NEXT: t17: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
+; CHECK-NEXT: t16: ch,glue = CopyToReg t14, Register:i32 $vgpr1, t22, t14:1
+; CHECK-NEXT: t17: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
; CHECK-EMPTY:
%loc = alloca i32, addrspace(5)
%j = load i32, ptr addrspace(5) %loc
@@ -54,8 +54,8 @@ define i64 @i16_test(i16 %i) nounwind readnone {
; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT: t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
+; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
; CHECK-EMPTY:
%loc = alloca i16, addrspace(5)
%j = load i16, ptr addrspace(5) %loc
@@ -75,8 +75,8 @@ define i64 @i8_test(i8 %i) nounwind readnone {
; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT: t18: ch = SI_RETURN # D:1 Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
+; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
; CHECK-EMPTY:
%loc = alloca i8, addrspace(5)
%j = load i8, ptr addrspace(5) %loc
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
index 904454a..df0053a 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
index 71902fe..54ff013 100644
--- a/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/Barcelona/zero-idioms.s
@@ -49,13 +49,13 @@ pxor %xmm2, %xmm2
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 35
-# CHECK-NEXT: Total Cycles: 39
+# CHECK-NEXT: Total Cycles: 37
# CHECK-NEXT: Total uOps: 35
# CHECK: Dispatch Width: 4
-# CHECK-NEXT: uOps Per Cycle: 0.90
-# CHECK-NEXT: IPC: 0.90
-# CHECK-NEXT: Block RThroughput: 11.0
+# CHECK-NEXT: uOps Per Cycle: 0.95
+# CHECK-NEXT: IPC: 0.95
+# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@@ -79,7 +79,7 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 pcmpgtw %xmm2, %xmm2
# CHECK-NEXT: 1 3 1.00 psubb %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubd %mm2, %mm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm2, %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubw %mm2, %mm2
# CHECK-NEXT: 1 0 0.25 psubb %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 psubd %xmm2, %xmm2
@@ -118,7 +118,7 @@ pxor %xmm2, %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - - 2.00 12.00 - 6.00 - -
+# CHECK-NEXT: - - 3.00 11.00 - 6.00 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -135,7 +135,7 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - pcmpgtw %xmm2, %xmm2
# CHECK-NEXT: - - - 1.00 - - - - psubb %mm2, %mm2
# CHECK-NEXT: - - - 1.00 - - - - psubd %mm2, %mm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm2, %mm2
+# CHECK-NEXT: - - - - - 1.00 - - psubq %mm2, %mm2
# CHECK-NEXT: - - - 1.00 - - - - psubw %mm2, %mm2
# CHECK-NEXT: - - - - - - - - psubb %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - psubd %xmm2, %xmm2
@@ -155,48 +155,48 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - pandn %xmm2, %xmm2
# CHECK-NEXT: - - - - - - - - xorps %xmm0, %xmm0
# CHECK-NEXT: - - - - - - - - xorpd %xmm1, %xmm1
-# CHECK-NEXT: - - - - - 1.00 - - pxor %mm2, %mm2
+# CHECK-NEXT: - - 1.00 - - - - - pxor %mm2, %mm2
# CHECK-NEXT: - - - - - - - - pxor %xmm2, %xmm2
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: 0123456789 0123456
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DR . . . . . . . . subl %eax, %eax
-# CHECK-NEXT: [0,1] DR . . . . . . . . subq %rax, %rax
-# CHECK-NEXT: [0,2] DR . . . . . . . . xorl %eax, %eax
-# CHECK-NEXT: [0,3] DR . . . . . . . . xorq %rax, %rax
-# CHECK-NEXT: [0,4] .DeeeER . . . . . . . pcmpgtb %mm2, %mm2
-# CHECK-NEXT: [0,5] .D===eeeER. . . . . . . pcmpgtd %mm2, %mm2
-# CHECK-NEXT: [0,6] .D======eeeER . . . . . . pcmpgtw %mm2, %mm2
-# CHECK-NEXT: [0,7] .D----------R . . . . . . pcmpgtb %xmm2, %xmm2
-# CHECK-NEXT: [0,8] . D---------R . . . . . . pcmpgtd %xmm2, %xmm2
-# CHECK-NEXT: [0,9] . D---------R . . . . . . pcmpgtq %xmm2, %xmm2
-# CHECK-NEXT: [0,10] . D---------R . . . . . . pcmpgtw %xmm2, %xmm2
-# CHECK-NEXT: [0,11] . D========eeeER . . . . . psubb %mm2, %mm2
-# CHECK-NEXT: [0,12] . D==========eeeER . . . . . psubd %mm2, %mm2
-# CHECK-NEXT: [0,13] . D=============eeeER . . . . psubq %mm2, %mm2
-# CHECK-NEXT: [0,14] . D================eeeER. . . . psubw %mm2, %mm2
-# CHECK-NEXT: [0,15] . D--------------------R. . . . psubb %xmm2, %xmm2
-# CHECK-NEXT: [0,16] . D-------------------R. . . . psubd %xmm2, %xmm2
-# CHECK-NEXT: [0,17] . D-------------------R. . . . psubq %xmm2, %xmm2
-# CHECK-NEXT: [0,18] . D-------------------R. . . . psubw %xmm2, %xmm2
-# CHECK-NEXT: [0,19] . D==================eeeER . . . psubsb %mm2, %mm2
-# CHECK-NEXT: [0,20] . D====================eeeER . . psubsw %mm2, %mm2
-# CHECK-NEXT: [0,21] . DeE----------------------R . . psubsb %xmm2, %xmm2
-# CHECK-NEXT: [0,22] . D=eE---------------------R . . psubsw %xmm2, %xmm2
-# CHECK-NEXT: [0,23] . D=======================eeeER . . psubusb %mm2, %mm2
-# CHECK-NEXT: [0,24] . .D=========================eeeER . psubusw %mm2, %mm2
-# CHECK-NEXT: [0,25] . .D=eE--------------------------R . psubusb %xmm2, %xmm2
-# CHECK-NEXT: [0,26] . .D==eE-------------------------R . psubusw %xmm2, %xmm2
-# CHECK-NEXT: [0,27] . .D==eE-------------------------R . andnps %xmm0, %xmm0
-# CHECK-NEXT: [0,28] . . D==eE------------------------R . andnpd %xmm1, %xmm1
-# CHECK-NEXT: [0,29] . . D===========================eER. pandn %mm2, %mm2
-# CHECK-NEXT: [0,30] . . D==eE-------------------------R. pandn %xmm2, %xmm2
-# CHECK-NEXT: [0,31] . . D-----------------------------R. xorps %xmm0, %xmm0
-# CHECK-NEXT: [0,32] . . D----------------------------R. xorpd %xmm1, %xmm1
-# CHECK-NEXT: [0,33] . . D===========================eER pxor %mm2, %mm2
-# CHECK-NEXT: [0,34] . . D-----------------------------R pxor %xmm2, %xmm2
+# CHECK: [0,0] DR . . . . . . .. subl %eax, %eax
+# CHECK-NEXT: [0,1] DR . . . . . . .. subq %rax, %rax
+# CHECK-NEXT: [0,2] DR . . . . . . .. xorl %eax, %eax
+# CHECK-NEXT: [0,3] DR . . . . . . .. xorq %rax, %rax
+# CHECK-NEXT: [0,4] .DeeeER . . . . . .. pcmpgtb %mm2, %mm2
+# CHECK-NEXT: [0,5] .D===eeeER. . . . . .. pcmpgtd %mm2, %mm2
+# CHECK-NEXT: [0,6] .D======eeeER . . . . .. pcmpgtw %mm2, %mm2
+# CHECK-NEXT: [0,7] .D----------R . . . . .. pcmpgtb %xmm2, %xmm2
+# CHECK-NEXT: [0,8] . D---------R . . . . .. pcmpgtd %xmm2, %xmm2
+# CHECK-NEXT: [0,9] . D---------R . . . . .. pcmpgtq %xmm2, %xmm2
+# CHECK-NEXT: [0,10] . D---------R . . . . .. pcmpgtw %xmm2, %xmm2
+# CHECK-NEXT: [0,11] . D========eeeER . . . .. psubb %mm2, %mm2
+# CHECK-NEXT: [0,12] . D==========eeeER . . . .. psubd %mm2, %mm2
+# CHECK-NEXT: [0,13] . D=============eER. . . .. psubq %mm2, %mm2
+# CHECK-NEXT: [0,14] . D==============eeeER . . .. psubw %mm2, %mm2
+# CHECK-NEXT: [0,15] . D------------------R . . .. psubb %xmm2, %xmm2
+# CHECK-NEXT: [0,16] . D-----------------R . . .. psubd %xmm2, %xmm2
+# CHECK-NEXT: [0,17] . D-----------------R . . .. psubq %xmm2, %xmm2
+# CHECK-NEXT: [0,18] . D-----------------R . . .. psubw %xmm2, %xmm2
+# CHECK-NEXT: [0,19] . D================eeeER . .. psubsb %mm2, %mm2
+# CHECK-NEXT: [0,20] . D==================eeeER . .. psubsw %mm2, %mm2
+# CHECK-NEXT: [0,21] . DeE--------------------R . .. psubsb %xmm2, %xmm2
+# CHECK-NEXT: [0,22] . D=eE-------------------R . .. psubsw %xmm2, %xmm2
+# CHECK-NEXT: [0,23] . D=====================eeeER .. psubusb %mm2, %mm2
+# CHECK-NEXT: [0,24] . .D=======================eeeER.. psubusw %mm2, %mm2
+# CHECK-NEXT: [0,25] . .D=eE------------------------R.. psubusb %xmm2, %xmm2
+# CHECK-NEXT: [0,26] . .D==eE-----------------------R.. psubusw %xmm2, %xmm2
+# CHECK-NEXT: [0,27] . .D==eE-----------------------R.. andnps %xmm0, %xmm0
+# CHECK-NEXT: [0,28] . . D==eE----------------------R.. andnpd %xmm1, %xmm1
+# CHECK-NEXT: [0,29] . . D=========================eER. pandn %mm2, %mm2
+# CHECK-NEXT: [0,30] . . D==eE-----------------------R. pandn %xmm2, %xmm2
+# CHECK-NEXT: [0,31] . . D---------------------------R. xorps %xmm0, %xmm0
+# CHECK-NEXT: [0,32] . . D--------------------------R. xorpd %xmm1, %xmm1
+# CHECK-NEXT: [0,33] . . D=========================eER pxor %mm2, %mm2
+# CHECK-NEXT: [0,34] . . D---------------------------R pxor %xmm2, %xmm2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -219,25 +219,25 @@ pxor %xmm2, %xmm2
# CHECK-NEXT: 11. 1 9.0 0.0 0.0 psubb %mm2, %mm2
# CHECK-NEXT: 12. 1 11.0 0.0 0.0 psubd %mm2, %mm2
# CHECK-NEXT: 13. 1 14.0 0.0 0.0 psubq %mm2, %mm2
-# CHECK-NEXT: 14. 1 17.0 0.0 0.0 psubw %mm2, %mm2
-# CHECK-NEXT: 15. 1 0.0 0.0 20.0 psubb %xmm2, %xmm2
-# CHECK-NEXT: 16. 1 0.0 0.0 19.0 psubd %xmm2, %xmm2
-# CHECK-NEXT: 17. 1 0.0 0.0 19.0 psubq %xmm2, %xmm2
-# CHECK-NEXT: 18. 1 0.0 0.0 19.0 psubw %xmm2, %xmm2
-# CHECK-NEXT: 19. 1 19.0 0.0 0.0 psubsb %mm2, %mm2
-# CHECK-NEXT: 20. 1 21.0 0.0 0.0 psubsw %mm2, %mm2
-# CHECK-NEXT: 21. 1 1.0 1.0 22.0 psubsb %xmm2, %xmm2
-# CHECK-NEXT: 22. 1 2.0 0.0 21.0 psubsw %xmm2, %xmm2
-# CHECK-NEXT: 23. 1 24.0 0.0 0.0 psubusb %mm2, %mm2
-# CHECK-NEXT: 24. 1 26.0 0.0 0.0 psubusw %mm2, %mm2
-# CHECK-NEXT: 25. 1 2.0 0.0 26.0 psubusb %xmm2, %xmm2
-# CHECK-NEXT: 26. 1 3.0 0.0 25.0 psubusw %xmm2, %xmm2
-# CHECK-NEXT: 27. 1 3.0 3.0 25.0 andnps %xmm0, %xmm0
-# CHECK-NEXT: 28. 1 3.0 3.0 24.0 andnpd %xmm1, %xmm1
-# CHECK-NEXT: 29. 1 28.0 0.0 0.0 pandn %mm2, %mm2
-# CHECK-NEXT: 30. 1 3.0 0.0 25.0 pandn %xmm2, %xmm2
-# CHECK-NEXT: 31. 1 0.0 0.0 29.0 xorps %xmm0, %xmm0
-# CHECK-NEXT: 32. 1 0.0 0.0 28.0 xorpd %xmm1, %xmm1
-# CHECK-NEXT: 33. 1 28.0 0.0 0.0 pxor %mm2, %mm2
-# CHECK-NEXT: 34. 1 0.0 0.0 29.0 pxor %xmm2, %xmm2
-# CHECK-NEXT: 1 6.5 0.2 10.5 <total>
+# CHECK-NEXT: 14. 1 15.0 0.0 0.0 psubw %mm2, %mm2
+# CHECK-NEXT: 15. 1 0.0 0.0 18.0 psubb %xmm2, %xmm2
+# CHECK-NEXT: 16. 1 0.0 0.0 17.0 psubd %xmm2, %xmm2
+# CHECK-NEXT: 17. 1 0.0 0.0 17.0 psubq %xmm2, %xmm2
+# CHECK-NEXT: 18. 1 0.0 0.0 17.0 psubw %xmm2, %xmm2
+# CHECK-NEXT: 19. 1 17.0 0.0 0.0 psubsb %mm2, %mm2
+# CHECK-NEXT: 20. 1 19.0 0.0 0.0 psubsw %mm2, %mm2
+# CHECK-NEXT: 21. 1 1.0 1.0 20.0 psubsb %xmm2, %xmm2
+# CHECK-NEXT: 22. 1 2.0 0.0 19.0 psubsw %xmm2, %xmm2
+# CHECK-NEXT: 23. 1 22.0 0.0 0.0 psubusb %mm2, %mm2
+# CHECK-NEXT: 24. 1 24.0 0.0 0.0 psubusw %mm2, %mm2
+# CHECK-NEXT: 25. 1 2.0 0.0 24.0 psubusb %xmm2, %xmm2
+# CHECK-NEXT: 26. 1 3.0 0.0 23.0 psubusw %xmm2, %xmm2
+# CHECK-NEXT: 27. 1 3.0 3.0 23.0 andnps %xmm0, %xmm0
+# CHECK-NEXT: 28. 1 3.0 3.0 22.0 andnpd %xmm1, %xmm1
+# CHECK-NEXT: 29. 1 26.0 0.0 0.0 pandn %mm2, %mm2
+# CHECK-NEXT: 30. 1 3.0 0.0 23.0 pandn %xmm2, %xmm2
+# CHECK-NEXT: 31. 1 0.0 0.0 27.0 xorps %xmm0, %xmm0
+# CHECK-NEXT: 32. 1 0.0 0.0 26.0 xorpd %xmm1, %xmm1
+# CHECK-NEXT: 33. 1 26.0 0.0 0.0 pxor %mm2, %mm2
+# CHECK-NEXT: 34. 1 0.0 0.0 27.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 1 6.1 0.2 9.7 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
index 69491f0..53b9d22 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: 1 5 0.50 * movq (%rax), %mm2
# CHECK-NEXT: 1 1 1.00 movq %mm0, %rcx
# CHECK-NEXT: 2 1 1.00 * movq %mm0, (%rax)
-# CHECK-NEXT: 3 3 2.00 packsswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packsswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packsswb (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packssdw %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packssdw %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packssdw (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packuswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packuswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packuswb (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 paddb %mm0, %mm2
# CHECK-NEXT: 2 6 0.50 * paddb (%rax), %mm2
@@ -284,7 +284,7 @@ pxor (%rax), %mm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 46.67 35.17 23.67 23.67 2.00 57.17 1.00 0.67
+# CHECK-NEXT: - - 45.92 34.42 23.67 23.67 2.00 56.42 0.25 0.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -297,11 +297,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - movq (%rax), %mm2
# CHECK-NEXT: - - 1.00 - - - - - - - movq %mm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 movq %mm0, (%rax)
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packsswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packsswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packsswb (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packssdw %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packssdw %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packssdw (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packuswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packuswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packuswb (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - - - 0.50 - - paddb %mm0, %mm2
# CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - paddb (%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
index 904454a..df0053a 100644
--- a/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Generic/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
index 5094dd1..01f516a 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-mmx.s
@@ -173,11 +173,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: 1 5 0.50 * movq (%rax), %mm2
# CHECK-NEXT: 1 1 1.00 movq %mm0, %rcx
# CHECK-NEXT: 2 1 1.00 * movq %mm0, (%rax)
-# CHECK-NEXT: 3 3 2.00 packsswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packsswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packsswb (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packssdw %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packssdw %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packssdw (%rax), %mm2
-# CHECK-NEXT: 3 3 2.00 packuswb %mm0, %mm2
+# CHECK-NEXT: 2 3 2.00 packuswb %mm0, %mm2
# CHECK-NEXT: 3 7 2.00 * packuswb (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 paddb %mm0, %mm2
# CHECK-NEXT: 2 6 0.50 * paddb (%rax), %mm2
@@ -284,7 +284,7 @@ pxor (%rax), %mm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 46.67 35.17 23.67 23.67 2.00 57.17 1.00 0.67
+# CHECK-NEXT: - - 45.92 34.42 23.67 23.67 2.00 56.42 0.25 0.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -297,11 +297,11 @@ pxor (%rax), %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - - - - movq (%rax), %mm2
# CHECK-NEXT: - - 1.00 - - - - - - - movq %mm0, %rcx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 movq %mm0, (%rax)
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packsswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packsswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packsswb (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packssdw %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packssdw %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packssdw (%rax), %mm2
-# CHECK-NEXT: - - 0.25 0.25 - - - 2.25 0.25 - packuswb %mm0, %mm2
+# CHECK-NEXT: - - - - - - - 2.00 - - packuswb %mm0, %mm2
# CHECK-NEXT: - - - - 0.50 0.50 - 2.00 - - packuswb (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - - - 0.50 - - paddb %mm0, %mm2
# CHECK-NEXT: - - - 0.50 0.50 0.50 - 0.50 - - paddb (%rax), %mm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
index c3b8b73..e2cfd02 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-sse2.s
@@ -624,8 +624,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 2 7 0.50 * psubb (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubd %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubd (%rax), %xmm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm0, %mm2
-# CHECK-NEXT: 2 8 1.00 * psubq (%rax), %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm0, %mm2
+# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %mm2
# CHECK-NEXT: 1 1 0.50 psubq %xmm0, %xmm2
# CHECK-NEXT: 2 7 0.50 * psubq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 psubsb %xmm0, %xmm2
@@ -687,7 +687,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - 172.00 75.83 118.33 17.00 100.83 67.00 67.00
+# CHECK-NEXT: - 172.00 75.83 117.33 17.00 101.83 67.00 67.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
@@ -908,8 +908,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubb (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubd %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubd (%rax), %xmm2
-# CHECK-NEXT: - - - 1.00 - - - - psubq %mm0, %mm2
-# CHECK-NEXT: - - - 1.00 - - 0.50 0.50 psubq (%rax), %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %mm0, %mm2
+# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %mm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubq %xmm0, %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 psubq (%rax), %xmm2
# CHECK-NEXT: - - - 0.50 - 0.50 - - psubsb %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
index 4a034cc..bdca772 100644
--- a/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
+++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/zero-idioms.s
@@ -83,12 +83,12 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 63
-# CHECK-NEXT: Total Cycles: 27
+# CHECK-NEXT: Total Cycles: 25
# CHECK-NEXT: Total uOps: 63
# CHECK: Dispatch Width: 4
-# CHECK-NEXT: uOps Per Cycle: 2.33
-# CHECK-NEXT: IPC: 2.33
+# CHECK-NEXT: uOps Per Cycle: 2.52
+# CHECK-NEXT: IPC: 2.52
# CHECK-NEXT: Block RThroughput: 15.8
# CHECK: Instruction Info:
@@ -121,7 +121,7 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 1 0 0.25 vpcmpgtw %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 1 3 1.00 psubb %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubd %mm2, %mm2
-# CHECK-NEXT: 1 3 1.00 psubq %mm2, %mm2
+# CHECK-NEXT: 1 1 0.50 psubq %mm2, %mm2
# CHECK-NEXT: 1 3 1.00 psubw %mm2, %mm2
# CHECK-NEXT: 1 0 0.25 psubb %xmm2, %xmm2
# CHECK-NEXT: 1 0 0.25 psubd %xmm2, %xmm2
@@ -250,71 +250,71 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DR . . . . .. subl %eax, %eax
-# CHECK-NEXT: [0,1] DR . . . . .. subq %rax, %rax
-# CHECK-NEXT: [0,2] DR . . . . .. xorl %eax, %eax
-# CHECK-NEXT: [0,3] DR . . . . .. xorq %rax, %rax
-# CHECK-NEXT: [0,4] .DeeeER . . . .. pcmpgtb %mm2, %mm2
-# CHECK-NEXT: [0,5] .D===eeeER. . . .. pcmpgtd %mm2, %mm2
-# CHECK-NEXT: [0,6] .D======eeeER . . .. pcmpgtw %mm2, %mm2
-# CHECK-NEXT: [0,7] .D----------R . . .. pcmpgtb %xmm2, %xmm2
-# CHECK-NEXT: [0,8] . D---------R . . .. pcmpgtd %xmm2, %xmm2
-# CHECK-NEXT: [0,9] . D---------R . . .. pcmpgtq %xmm2, %xmm2
-# CHECK-NEXT: [0,10] . D---------R . . .. pcmpgtw %xmm2, %xmm2
-# CHECK-NEXT: [0,11] . D---------R . . .. vpcmpgtb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,12] . D--------R . . .. vpcmpgtd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,13] . D--------R . . .. vpcmpgtq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,14] . D--------R . . .. vpcmpgtw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,15] . D--------R . . .. vpcmpgtb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,16] . D-------R . . .. vpcmpgtd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,17] . D-------R . . .. vpcmpgtq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,18] . D-------R . . .. vpcmpgtw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,19] . D======eeeER . .. psubb %mm2, %mm2
-# CHECK-NEXT: [0,20] . D========eeeER . .. psubd %mm2, %mm2
-# CHECK-NEXT: [0,21] . D===========eeeER .. psubq %mm2, %mm2
-# CHECK-NEXT: [0,22] . D==============eeeER.. psubw %mm2, %mm2
-# CHECK-NEXT: [0,23] . D------------------R.. psubb %xmm2, %xmm2
-# CHECK-NEXT: [0,24] . .D-----------------R.. psubd %xmm2, %xmm2
-# CHECK-NEXT: [0,25] . .D-----------------R.. psubq %xmm2, %xmm2
-# CHECK-NEXT: [0,26] . .D-----------------R.. psubw %xmm2, %xmm2
-# CHECK-NEXT: [0,27] . .D-----------------R.. vpsubb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,28] . . D----------------R.. vpsubd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,29] . . D----------------R.. vpsubq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,30] . . D----------------R.. vpsubw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,31] . . D----------------R.. vpsubb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,32] . . D---------------R.. vpsubd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,33] . . D---------------R.. vpsubq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,34] . . D---------------R.. vpsubw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,35] . . DeE-------------R.. andnps %xmm0, %xmm0
-# CHECK-NEXT: [0,36] . . DeE------------R.. andnpd %xmm1, %xmm1
-# CHECK-NEXT: [0,37] . . D=eE-----------R.. vandnps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,38] . . D===eE---------R.. vandnpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,39] . . D==eE----------R.. vandnps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,40] . . D===eE--------R.. vandnpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,41] . . D============eER. pandn %mm2, %mm2
-# CHECK-NEXT: [0,42] . . D==eE----------R. pandn %xmm2, %xmm2
-# CHECK-NEXT: [0,43] . . DeE------------R. vpandn %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,44] . . .D===eE--------R. vandnps %xmm2, %xmm2, %xmm5
-# CHECK-NEXT: [0,45] . . .D====eE-------R. vandnpd %xmm1, %xmm1, %xmm5
-# CHECK-NEXT: [0,46] . . .DeE-----------R. vpandn %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,47] . . .D=====eE------R. vandnps %ymm2, %ymm2, %ymm5
-# CHECK-NEXT: [0,48] . . . D=====eE-----R. vandnpd %ymm1, %ymm1, %ymm5
-# CHECK-NEXT: [0,49] . . . D------------R. xorps %xmm0, %xmm0
-# CHECK-NEXT: [0,50] . . . D------------R. xorpd %xmm1, %xmm1
-# CHECK-NEXT: [0,51] . . . D------------R. vxorps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: [0,52] . . . D-----------R. vxorpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: [0,53] . . . D-----------R. vxorps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: [0,54] . . . D-----------R. vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: [0,55] . . . D==========eER pxor %mm2, %mm2
-# CHECK-NEXT: [0,56] . . . D-----------R pxor %xmm2, %xmm2
-# CHECK-NEXT: [0,57] . . . D-----------R vpxor %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,58] . . . D-----------R vxorps %xmm4, %xmm4, %xmm5
-# CHECK-NEXT: [0,59] . . . D-----------R vxorpd %xmm1, %xmm1, %xmm3
-# CHECK-NEXT: [0,60] . . . D----------R vxorps %ymm4, %ymm4, %ymm5
-# CHECK-NEXT: [0,61] . . . D----------R vxorpd %ymm1, %ymm1, %ymm3
-# CHECK-NEXT: [0,62] . . . D----------R vpxor %xmm3, %xmm3, %xmm5
+# CHECK: [0,0] DR . . . . . subl %eax, %eax
+# CHECK-NEXT: [0,1] DR . . . . . subq %rax, %rax
+# CHECK-NEXT: [0,2] DR . . . . . xorl %eax, %eax
+# CHECK-NEXT: [0,3] DR . . . . . xorq %rax, %rax
+# CHECK-NEXT: [0,4] .DeeeER . . . . pcmpgtb %mm2, %mm2
+# CHECK-NEXT: [0,5] .D===eeeER. . . . pcmpgtd %mm2, %mm2
+# CHECK-NEXT: [0,6] .D======eeeER . . . pcmpgtw %mm2, %mm2
+# CHECK-NEXT: [0,7] .D----------R . . . pcmpgtb %xmm2, %xmm2
+# CHECK-NEXT: [0,8] . D---------R . . . pcmpgtd %xmm2, %xmm2
+# CHECK-NEXT: [0,9] . D---------R . . . pcmpgtq %xmm2, %xmm2
+# CHECK-NEXT: [0,10] . D---------R . . . pcmpgtw %xmm2, %xmm2
+# CHECK-NEXT: [0,11] . D---------R . . . vpcmpgtb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,12] . D--------R . . . vpcmpgtd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,13] . D--------R . . . vpcmpgtq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,14] . D--------R . . . vpcmpgtw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,15] . D--------R . . . vpcmpgtb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,16] . D-------R . . . vpcmpgtd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,17] . D-------R . . . vpcmpgtq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,18] . D-------R . . . vpcmpgtw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,19] . D======eeeER . . psubb %mm2, %mm2
+# CHECK-NEXT: [0,20] . D========eeeER . . psubd %mm2, %mm2
+# CHECK-NEXT: [0,21] . D===========eER. . psubq %mm2, %mm2
+# CHECK-NEXT: [0,22] . D============eeeER . psubw %mm2, %mm2
+# CHECK-NEXT: [0,23] . D----------------R . psubb %xmm2, %xmm2
+# CHECK-NEXT: [0,24] . .D---------------R . psubd %xmm2, %xmm2
+# CHECK-NEXT: [0,25] . .D---------------R . psubq %xmm2, %xmm2
+# CHECK-NEXT: [0,26] . .D---------------R . psubw %xmm2, %xmm2
+# CHECK-NEXT: [0,27] . .D---------------R . vpsubb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,28] . . D--------------R . vpsubd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,29] . . D--------------R . vpsubq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,30] . . D--------------R . vpsubw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,31] . . D--------------R . vpsubb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,32] . . D-------------R . vpsubd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,33] . . D-------------R . vpsubq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,34] . . D-------------R . vpsubw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,35] . . DeE-----------R . andnps %xmm0, %xmm0
+# CHECK-NEXT: [0,36] . . DeE----------R . andnpd %xmm1, %xmm1
+# CHECK-NEXT: [0,37] . . D=eE---------R . vandnps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,38] . . D===eE-------R . vandnpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,39] . . D==eE--------R . vandnps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,40] . . D===eE------R . vandnpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,41] . . D==========eER. pandn %mm2, %mm2
+# CHECK-NEXT: [0,42] . . D==eE--------R. pandn %xmm2, %xmm2
+# CHECK-NEXT: [0,43] . . DeE----------R. vpandn %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,44] . . .D===eE------R. vandnps %xmm2, %xmm2, %xmm5
+# CHECK-NEXT: [0,45] . . .D====eE-----R. vandnpd %xmm1, %xmm1, %xmm5
+# CHECK-NEXT: [0,46] . . .DeE---------R. vpandn %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,47] . . .D=====eE----R. vandnps %ymm2, %ymm2, %ymm5
+# CHECK-NEXT: [0,48] . . . D=====eE---R. vandnpd %ymm1, %ymm1, %ymm5
+# CHECK-NEXT: [0,49] . . . D----------R. xorps %xmm0, %xmm0
+# CHECK-NEXT: [0,50] . . . D----------R. xorpd %xmm1, %xmm1
+# CHECK-NEXT: [0,51] . . . D----------R. vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,52] . . . D---------R. vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,53] . . . D---------R. vxorps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: [0,54] . . . D---------R. vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,55] . . . D========eER pxor %mm2, %mm2
+# CHECK-NEXT: [0,56] . . . D---------R pxor %xmm2, %xmm2
+# CHECK-NEXT: [0,57] . . . D---------R vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,58] . . . D---------R vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,59] . . . D---------R vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,60] . . . D--------R vxorps %ymm4, %ymm4, %ymm5
+# CHECK-NEXT: [0,61] . . . D--------R vxorpd %ymm1, %ymm1, %ymm3
+# CHECK-NEXT: [0,62] . . . D--------R vpxor %xmm3, %xmm3, %xmm5
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -345,45 +345,45 @@ vpxor %xmm3, %xmm3, %xmm5
# CHECK-NEXT: 19. 1 7.0 0.0 0.0 psubb %mm2, %mm2
# CHECK-NEXT: 20. 1 9.0 0.0 0.0 psubd %mm2, %mm2
# CHECK-NEXT: 21. 1 12.0 0.0 0.0 psubq %mm2, %mm2
-# CHECK-NEXT: 22. 1 15.0 0.0 0.0 psubw %mm2, %mm2
-# CHECK-NEXT: 23. 1 0.0 0.0 18.0 psubb %xmm2, %xmm2
-# CHECK-NEXT: 24. 1 0.0 0.0 17.0 psubd %xmm2, %xmm2
-# CHECK-NEXT: 25. 1 0.0 0.0 17.0 psubq %xmm2, %xmm2
-# CHECK-NEXT: 26. 1 0.0 0.0 17.0 psubw %xmm2, %xmm2
-# CHECK-NEXT: 27. 1 0.0 0.0 17.0 vpsubb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 28. 1 0.0 0.0 16.0 vpsubd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 29. 1 0.0 0.0 16.0 vpsubq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 30. 1 0.0 0.0 16.0 vpsubw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 31. 1 0.0 0.0 16.0 vpsubb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 32. 1 0.0 0.0 15.0 vpsubd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 33. 1 0.0 0.0 15.0 vpsubq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 34. 1 0.0 0.0 15.0 vpsubw %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 35. 1 1.0 1.0 13.0 andnps %xmm0, %xmm0
-# CHECK-NEXT: 36. 1 1.0 1.0 12.0 andnpd %xmm1, %xmm1
-# CHECK-NEXT: 37. 1 2.0 2.0 11.0 vandnps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 38. 1 4.0 2.0 9.0 vandnpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 39. 1 3.0 0.0 10.0 vandnps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 40. 1 4.0 0.0 8.0 vandnpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 41. 1 13.0 0.0 0.0 pandn %mm2, %mm2
-# CHECK-NEXT: 42. 1 3.0 0.0 10.0 pandn %xmm2, %xmm2
-# CHECK-NEXT: 43. 1 1.0 1.0 12.0 vpandn %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 44. 1 4.0 1.0 8.0 vandnps %xmm2, %xmm2, %xmm5
-# CHECK-NEXT: 45. 1 5.0 1.0 7.0 vandnpd %xmm1, %xmm1, %xmm5
-# CHECK-NEXT: 46. 1 1.0 0.0 11.0 vpandn %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 47. 1 6.0 3.0 6.0 vandnps %ymm2, %ymm2, %ymm5
-# CHECK-NEXT: 48. 1 6.0 3.0 5.0 vandnpd %ymm1, %ymm1, %ymm5
-# CHECK-NEXT: 49. 1 0.0 0.0 12.0 xorps %xmm0, %xmm0
-# CHECK-NEXT: 50. 1 0.0 0.0 12.0 xorpd %xmm1, %xmm1
-# CHECK-NEXT: 51. 1 0.0 0.0 12.0 vxorps %xmm2, %xmm2, %xmm2
-# CHECK-NEXT: 52. 1 0.0 0.0 11.0 vxorpd %xmm1, %xmm1, %xmm1
-# CHECK-NEXT: 53. 1 0.0 0.0 11.0 vxorps %ymm2, %ymm2, %ymm2
-# CHECK-NEXT: 54. 1 0.0 0.0 11.0 vxorpd %ymm1, %ymm1, %ymm1
-# CHECK-NEXT: 55. 1 11.0 0.0 0.0 pxor %mm2, %mm2
-# CHECK-NEXT: 56. 1 0.0 0.0 11.0 pxor %xmm2, %xmm2
-# CHECK-NEXT: 57. 1 0.0 0.0 11.0 vpxor %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 58. 1 0.0 0.0 11.0 vxorps %xmm4, %xmm4, %xmm5
-# CHECK-NEXT: 59. 1 0.0 0.0 11.0 vxorpd %xmm1, %xmm1, %xmm3
-# CHECK-NEXT: 60. 1 0.0 0.0 10.0 vxorps %ymm4, %ymm4, %ymm5
-# CHECK-NEXT: 61. 1 0.0 0.0 10.0 vxorpd %ymm1, %ymm1, %ymm3
-# CHECK-NEXT: 62. 1 0.0 0.0 10.0 vpxor %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 1 1.9 0.3 8.9 <total>
+# CHECK-NEXT: 22. 1 13.0 0.0 0.0 psubw %mm2, %mm2
+# CHECK-NEXT: 23. 1 0.0 0.0 16.0 psubb %xmm2, %xmm2
+# CHECK-NEXT: 24. 1 0.0 0.0 15.0 psubd %xmm2, %xmm2
+# CHECK-NEXT: 25. 1 0.0 0.0 15.0 psubq %xmm2, %xmm2
+# CHECK-NEXT: 26. 1 0.0 0.0 15.0 psubw %xmm2, %xmm2
+# CHECK-NEXT: 27. 1 0.0 0.0 15.0 vpsubb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 28. 1 0.0 0.0 14.0 vpsubd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 29. 1 0.0 0.0 14.0 vpsubq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 30. 1 0.0 0.0 14.0 vpsubw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 31. 1 0.0 0.0 14.0 vpsubb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 32. 1 0.0 0.0 13.0 vpsubd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 33. 1 0.0 0.0 13.0 vpsubq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 34. 1 0.0 0.0 13.0 vpsubw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 35. 1 1.0 1.0 11.0 andnps %xmm0, %xmm0
+# CHECK-NEXT: 36. 1 1.0 1.0 10.0 andnpd %xmm1, %xmm1
+# CHECK-NEXT: 37. 1 2.0 2.0 9.0 vandnps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 38. 1 4.0 2.0 7.0 vandnpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 39. 1 3.0 0.0 8.0 vandnps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 40. 1 4.0 0.0 6.0 vandnpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 41. 1 11.0 0.0 0.0 pandn %mm2, %mm2
+# CHECK-NEXT: 42. 1 3.0 0.0 8.0 pandn %xmm2, %xmm2
+# CHECK-NEXT: 43. 1 1.0 1.0 10.0 vpandn %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 44. 1 4.0 1.0 6.0 vandnps %xmm2, %xmm2, %xmm5
+# CHECK-NEXT: 45. 1 5.0 1.0 5.0 vandnpd %xmm1, %xmm1, %xmm5
+# CHECK-NEXT: 46. 1 1.0 0.0 9.0 vpandn %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 47. 1 6.0 3.0 4.0 vandnps %ymm2, %ymm2, %ymm5
+# CHECK-NEXT: 48. 1 6.0 3.0 3.0 vandnpd %ymm1, %ymm1, %ymm5
+# CHECK-NEXT: 49. 1 0.0 0.0 10.0 xorps %xmm0, %xmm0
+# CHECK-NEXT: 50. 1 0.0 0.0 10.0 xorpd %xmm1, %xmm1
+# CHECK-NEXT: 51. 1 0.0 0.0 10.0 vxorps %xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 52. 1 0.0 0.0 9.0 vxorpd %xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 53. 1 0.0 0.0 9.0 vxorps %ymm2, %ymm2, %ymm2
+# CHECK-NEXT: 54. 1 0.0 0.0 9.0 vxorpd %ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 55. 1 9.0 0.0 0.0 pxor %mm2, %mm2
+# CHECK-NEXT: 56. 1 0.0 0.0 9.0 pxor %xmm2, %xmm2
+# CHECK-NEXT: 57. 1 0.0 0.0 9.0 vpxor %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 58. 1 0.0 0.0 9.0 vxorps %xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 59. 1 0.0 0.0 9.0 vxorpd %xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 60. 1 0.0 0.0 8.0 vxorps %ymm4, %ymm4, %ymm5
+# CHECK-NEXT: 61. 1 0.0 0.0 8.0 vxorpd %ymm1, %ymm1, %ymm3
+# CHECK-NEXT: 62. 1 0.0 0.0 8.0 vpxor %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1 1.8 0.3 7.7 <total>
diff --git a/llvm/test/tools/llvm-objcopy/ELF/change-section-address.test b/llvm/test/tools/llvm-objcopy/ELF/change-section-address.test
new file mode 100644
index 0000000..b17b149
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/change-section-address.test
@@ -0,0 +1,199 @@
+## This test checks the behavior of the --change-section-address option.
+
+# RUN: yaml2obj -DTYPE=REL %s -o %ti1
+
+## Basic check that the option processes wildcards and changes the address as expected.
+# RUN: llvm-objcopy --change-section-address *+0x20 %ti1 %to1
+# RUN: llvm-readelf --section-headers %to1 | FileCheck %s --check-prefix=CHECK-ADD-ALL
+
+## Check that --change-section-address alias --adjust-section-vma produces the same output as the test above.
+# RUN: llvm-objcopy --adjust-section-vma *+0x20 %ti1 %to2
+# RUN: cmp %to1 %to2
+
+## Check that negative adjustment reduces the address by the specified value.
+# RUN: llvm-objcopy --change-section-address .anotherone-0x30 %ti1 %to3
+# RUN: llvm-readelf --section-headers %to3 | FileCheck %s --check-prefix=CHECK-SUB-SECTION
+
+## Check that a wildcard pattern works and only the specified sections are updated.
+# RUN: llvm-objcopy --change-section-address .text*+0x20 %ti1 %to4
+# RUN: llvm-readelf --section-headers %to4 | FileCheck %s --check-prefix=CHECK-ADD-PATTERN
+
+## Check that regex pattern can be used with --change-section-address.
+# RUN: llvm-objcopy --regex --change-section-address .text.+0x20 %ti1 %to5
+# RUN: llvm-readelf --section-headers %to5 | FileCheck %s --check-prefix=CHECK-ADD-PATTERN
+
+## Check that a section address can be set to a specific value.
+# RUN: llvm-objcopy --change-section-address .text*=0x10 %ti1 %to6
+# RUN: llvm-readelf --section-headers %to6 | FileCheck %s --check-prefix=CHECK-SET-PATTERN
+
+## Check that a section address can be set to the maximum possible value (UINT64_MAX).
+# RUN: llvm-objcopy --change-section-address .text2=0xffffffffffffffff %ti1 %to7
+# RUN: llvm-readelf --section-headers %to7 | FileCheck %s --check-prefix=CHECK-MAX
+
+## Check that a section address can be adjusted to the maximum possible value (UINT64_MAX).
+# RUN: llvm-objcopy --change-section-address .text2+0xfffffffffffffdff %ti1 %to8
+# RUN: llvm-readelf --section-headers %to8 | FileCheck %s --check-prefix=CHECK-MAX
+
+## Check that the section address can be adjusted to the minimum possible value (0).
+# RUN: llvm-objcopy --change-section-address .text2-0x200 %ti1 %to9
+# RUN: llvm-readelf --section-headers %to9 | FileCheck %s --check-prefix=CHECK-ZERO
+
+## Check that a section address can be adjusted by the maximum possible positive offset (UINT64_MAX).
+# RUN: llvm-objcopy --change-section-address .text2=0 %ti1 %to10
+# RUN: llvm-objcopy --change-section-address .text2+0xffffffffffffffff %to10 %to11
+# RUN: llvm-readelf --section-headers %to11 | FileCheck %s --check-prefix=CHECK-MAX
+
+## Check that a section address can be adjusted by the maximum possible negative offset (-UINT64_MAX).
+# RUN: llvm-objcopy --change-section-address .text2=0xffffffffffffffff %ti1 %to12
+# RUN: llvm-objcopy --change-section-address .text2-0xffffffffffffffff %to12 %to13
+# RUN: llvm-readelf --section-headers %to13 | FileCheck %s --check-prefix=CHECK-ZERO
+
+## Check two independent changes.
+# RUN: llvm-objcopy --change-section-address .text1=0x110 --change-section-address .text2=0x210 %ti1 %to14
+# RUN: llvm-readelf --section-headers %to14 | FileCheck %s --check-prefix=CHECK-INDEPENDENT
+
+## Check two overlapping changes.
+# RUN: llvm-objcopy --change-section-address .anotherone-0x30 --change-section-address .anotherone+0x20 %ti1 %to15
+# RUN: llvm-readelf --section-headers %to15 | FileCheck %s --check-prefix=CHECK-USE-LAST
+
+## Check unused option.
+# RUN: llvm-objcopy --change-section-address .anotherone=0x455 --change-section-address *+0x20 %ti1 %to16
+# RUN: llvm-readelf --section-headers %to16 | FileCheck %s --check-prefix=CHECK-NOTSUPERSET-SET
+
+## Check partial overlap (.anotherone overlaps).
+# RUN: llvm-objcopy --change-section-address *+0x20 --change-section-address .anotherone=0x455 %ti1 %to17
+# RUN: llvm-readelf --section-headers %to17 | FileCheck %s --check-prefix=CHECK-SUPERSET-SET
+
+## Check more complex partial overlap (P1: .anotherone, .text2; P2: .text1, .text2) (.text2 overlaps).
+# RUN: llvm-objcopy --regex --change-section-address ".(text2|anotherone)+0x20" --change-section-address .text.*+0x30 %ti1 %to18
+# RUN: llvm-readelf --section-headers %to18 | FileCheck %s --check-prefix=CHECK-PARTIAL-OVERLAP
+
+# CHECK-ADD-ALL: [Nr] Name Type Address
+# CHECK-ADD-ALL: .text1
+# CHECK-ADD-ALL-SAME: 0000000000000120
+# CHECK-ADD-ALL: .text2
+# CHECK-ADD-ALL-SAME: 0000000000000220
+# CHECK-ADD-ALL: .anotherone
+# CHECK-ADD-ALL-SAME: 0000000000000320
+# CHECK-ADD-ALL: =a-b+c++d
+# CHECK-ADD-ALL-SAME: 0000000000000420
+# CHECK-ADD-ALL: .strtab
+# CHECK-ADD-ALL-SAME: 0000000000000020
+# CHECK-ADD-ALL: .shstrtab
+# CHECK-ADD-ALL-SAME: 0000000000000020
+
+# CHECK-SUB-SECTION: .text1
+# CHECK-SUB-SECTION-SAME: 0000000000000100
+# CHECK-SUB-SECTION: .text2
+# CHECK-SUB-SECTION-SAME: 0000000000000200
+# CHECK-SUB-SECTION: .anotherone
+# CHECK-SUB-SECTION-SAME: 00000000000002d0
+
+# CHECK-ADD-PATTERN: .text1
+# CHECK-ADD-PATTERN-SAME: 0000000000000120
+# CHECK-ADD-PATTERN: .text2
+# CHECK-ADD-PATTERN-SAME: 0000000000000220
+# CHECK-ADD-PATTERN: .anotherone
+# CHECK-ADD-PATTERN-SAME: 0000000000000300
+
+# CHECK-SET-PATTERN: .text1
+# CHECK-SET-PATTERN-SAME: 0000000000000010
+# CHECK-SET-PATTERN: .text2
+# CHECK-SET-PATTERN-SAME: 0000000000000010
+# CHECK-SET-PATTERN: .anotherone
+# CHECK-SET-PATTERN-SAME: 0000000000000300
+
+# CHECK-MAX: .text2
+# CHECK-MAX-SAME: ffffffffffffffff
+# CHECK-ZERO: .text2
+# CHECK-ZERO-SAME: 0000000000000000
+
+# CHECK-INDEPENDENT: .text1
+# CHECK-INDEPENDENT-SAME: 0000000000000110
+# CHECK-INDEPENDENT: .text2
+# CHECK-INDEPENDENT-SAME: 0000000000000210
+
+# CHECK-USE-LAST: .anotherone
+# CHECK-USE-LAST-SAME: 0000000000000320
+
+# CHECK-NOTSUPERSET-SET: .text1
+# CHECK-NOTSUPERSET-SET-SAME: 0000000000000120
+# CHECK-NOTSUPERSET-SET: .text2
+# CHECK-NOTSUPERSET-SET-SAME: 0000000000000220
+# CHECK-NOTSUPERSET-SET: .anotherone
+# CHECK-NOTSUPERSET-SET-SAME: 0000000000000320
+
+# CHECK-SUPERSET-SET: .text1
+# CHECK-SUPERSET-SET-SAME: 0000000000000120
+# CHECK-SUPERSET-SET: .text2
+# CHECK-SUPERSET-SET-SAME: 0000000000000220
+# CHECK-SUPERSET-SET: .anotherone
+# CHECK-SUPERSET-SET-SAME: 0000000000000455
+
+# CHECK-PARTIAL-OVERLAP: .text1
+# CHECK-PARTIAL-OVERLAP-SAME: 0000000000000130
+# CHECK-PARTIAL-OVERLAP: .text2
+# CHECK-PARTIAL-OVERLAP-SAME: 0000000000000230
+# CHECK-PARTIAL-OVERLAP: .anotherone
+# CHECK-PARTIAL-OVERLAP-SAME: 0000000000000320
+
+## Check overflow by 1.
+# RUN: not llvm-objcopy --change-section-address .anotherone+0xfffffffffffffd00 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-OVERFLOW
+## Check underflow by 1.
+# RUN: not llvm-objcopy --change-section-address .text2-0x201 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-UNDERFLOW
+## Check error when argument value is invalid as a whole.
+# RUN: not llvm-objcopy --change-section-address 0 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-INVALID-VAL
+## Check error when the value part of the argument is not a valid integer.
+# RUN: not llvm-objcopy --change-section-address .anotherone+0c50 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-NOT-INTEGER
+## Check error when the value does not fit in uint64_t.
+# RUN not llvm-objcopy --change-section-address .text1=0x10000000000000000 %ti1 %to 2>&1 | FileCheck %s --check-prefix=ERR-NOT-INTEGER
+## Check error when the section pattern is missing.
+# RUN: not llvm-objcopy --change-section-address =0x10 %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-SECTION
+## Check error when the negative adjustment value is missing.
+# RUN: not llvm-objcopy --change-section-address .text1- %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-VALUE-MINUS
+## Check error when the positive adjustment value is missing.
+# RUN: not llvm-objcopy --change-section-address .text1+ %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-VALUE-PLUS
+## Check error when the value to set the address to is missing.
+# RUN: not llvm-objcopy --change-section-address .text1= %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MISSING-VALUE-EQUAL
+## Check error when the provided regex is invalid.
+# RUN: not llvm-objcopy --regex --change-section-address "ab**-0x20" %ti1 2>&1 | FileCheck %s --check-prefix=ERR-MATCHER-FAILURE
+
+# ERR-OVERFLOW: address 0x300 cannot be increased by 0xfffffffffffffd00. The result would overflow
+# ERR-UNDERFLOW: address 0x200 cannot be decreased by 0x201. The result would underflow
+# ERR-INVALID-VAL: error: bad format for --change-section-address: argument value 0 is invalid. See --help
+# ERR-NOT-INTEGER: error: bad format for --change-section-address: value after + is 0c50 when it should be a 64-bit integer
+# ERR-MISSING-SECTION: error: bad format for --change-section-address: missing section pattern to apply address change to
+# ERR-MISSING-VALUE-MINUS: error: bad format for --change-section-address: missing value of offset after '-'
+# ERR-MISSING-VALUE-PLUS: error: bad format for --change-section-address: missing value of offset after '+'
+# ERR-MISSING-VALUE-EQUAL: error: bad format for --change-section-address: missing address value after '='
+# ERR-MATCHER-FAILURE: error: cannot compile regular expression 'ab**': repetition-operator operand invalid
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_[[TYPE]]
+Sections:
+ - Name: .text1
+ Type: SHT_PROGBITS
+ Size: 0x100
+ Address: 0x100
+ - Name: .text2
+ Type: SHT_PROGBITS
+ Size: 0x100
+ Address: 0x200
+ - Name: .anotherone
+ Type: SHT_PROGBITS
+ Size: 0x100
+ Address: 0x300
+ - Name: =a-b+c++d
+ Type: SHT_PROGBITS
+ Size: 0x100
+ Address: 0x400
+
+# RUN: yaml2obj -DTYPE=EXEC %s -o %ti2
+
+## Input file is not ET_REL
+# RUN: not llvm-objcopy --change-section-address *+0x20 %ti2 2>&1 | FileCheck %s --check-prefix=ERR-FILE-TYPE
+
+# ERR-FILE-TYPE: cannot change section address in a non-relocatable file
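
The RUN lines above pin down the arithmetic for the three update forms: '=' sets the address unconditionally, while '+' and '-' must not wrap around the 64-bit address space (hence the ERR-OVERFLOW and ERR-UNDERFLOW cases). A minimal stand-alone C++ sketch of that rule, using illustrative names rather than the actual llvm-objcopy types:

#include <cstdint>
#include <optional>

// Hypothetical mirror of the sectionpattern{=|+|-}val semantics exercised
// above: apply an update to a section's current address, refusing to wrap.
enum class UpdateKind { Set, Add, Subtract };

std::optional<uint64_t> applyUpdate(uint64_t Addr, UpdateKind K, uint64_t Val) {
  switch (K) {
  case UpdateKind::Set:
    return Val;                      // e.g. .text2=0xffffffffffffffff
  case UpdateKind::Add:
    if (Addr > UINT64_MAX - Val)
      return std::nullopt;           // "The result would overflow"
    return Addr + Val;
  case UpdateKind::Subtract:
    if (Val > Addr)
      return std::nullopt;           // "The result would underflow"
    return Addr - Val;
  }
  return std::nullopt;
}

For example, 0x300 + 0xfffffffffffffd00 is exactly one past UINT64_MAX and is rejected, while 0x200 + 0xfffffffffffffdff lands exactly on UINT64_MAX and is accepted, matching the ERR-OVERFLOW and CHECK-MAX cases above.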
diff --git a/llvm/test/tools/llvm-objdump/BPF/disassemble-symbolize-operands.s b/llvm/test/tools/llvm-objdump/BPF/disassemble-symbolize-operands.s
new file mode 100644
index 0000000..a52ed56
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/BPF/disassemble-symbolize-operands.s
@@ -0,0 +1,24 @@
+# REQUIRES: bpf-registered-target
+
+## Verify generation of 'Lxx' labels for local jump targets when
+## the --symbolize-operands option is specified.
+
+# RUN: llvm-mc -triple=bpfel %s -filetype=obj -o %t
+# RUN: llvm-objdump -d --symbolize-operands --no-show-raw-insn --no-leading-addr %t | \
+# RUN: FileCheck %s
+ .text
+main:
+ if r1 > 42 goto +2
+ r1 -= 10
+ goto -3
+ r0 = 0
+ exit
+
+# CHECK: <main>:
+# CHECK-NEXT: <L1>:
+# CHECK-NEXT: if r1 > 0x2a goto +0x2 <L0>
+# CHECK-NEXT: r1 -= 0xa
+# CHECK-NEXT: goto -0x3 <main>
+# CHECK-NEXT: <L0>:
+# CHECK-NEXT: r0 = 0x0
+# CHECK-NEXT: exit
diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
index 5840e02..81f2c9c 100644
--- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
+++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml
@@ -34,21 +34,21 @@
#
# CHECK: << Total TLI yes SDK no: 8
# CHECK: >> Total TLI no SDK yes: 0
-# CHECK: == Total TLI yes SDK yes: 242
+# CHECK: == Total TLI yes SDK yes: 248
#
# WRONG_DETAIL: << TLI yes SDK no : '_ZdaPv' aka operator delete[](void*)
# WRONG_DETAIL: >> TLI no SDK yes: '_ZdaPvj' aka operator delete[](void*, unsigned int)
# WRONG_DETAIL-COUNT-8: << TLI yes SDK no : {{.*}}__hot_cold_t
# WRONG_SUMMARY: << Total TLI yes SDK no: 9{{$}}
# WRONG_SUMMARY: >> Total TLI no SDK yes: 1{{$}}
-# WRONG_SUMMARY: == Total TLI yes SDK yes: 241
+# WRONG_SUMMARY: == Total TLI yes SDK yes: 247
#
## The -COUNT suffix doesn't care if there are too many matches, so check
## the exact count first; the two directives should add up to that.
## Yes, this means additions to TLI will fail this test, but the argument
## to -COUNT can't be an expression.
-# AVAIL: TLI knows 483 symbols, 250 available
-# AVAIL-COUNT-250: {{^}} available
+# AVAIL: TLI knows 489 symbols, 256 available
+# AVAIL-COUNT-256: {{^}} available
# AVAIL-NOT: {{^}} available
# UNAVAIL-COUNT-233: not available
# UNAVAIL-NOT: not available
@@ -267,6 +267,18 @@ DynamicSymbols:
Type: STT_FUNC
Section: .text
Binding: STB_GLOBAL
+ - Name: abort
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ - Name: exit
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ - Name: _Exit
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
- Name: atof
Type: STT_FUNC
Section: .text
@@ -691,6 +703,18 @@ DynamicSymbols:
Type: STT_FUNC
Section: .text
Binding: STB_GLOBAL
+ - Name: nan
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ - Name: nanf
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ - Name: nanl
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
- Name: nearbyint
Type: STT_FUNC
Section: .text
diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp
index 31c089e..c8088da 100644
--- a/llvm/tools/llc/NewPMDriver.cpp
+++ b/llvm/tools/llc/NewPMDriver.cpp
@@ -45,10 +45,6 @@
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Cloning.h"
-namespace llvm {
-extern cl::opt<bool> PrintPipelinePasses;
-} // namespace llvm
-
using namespace llvm;
static cl::opt<std::string>
diff --git a/llvm/tools/llvm-ctxprof-util/CMakeLists.txt b/llvm/tools/llvm-ctxprof-util/CMakeLists.txt
index abf8e1a..4814c99 100644
--- a/llvm/tools/llvm-ctxprof-util/CMakeLists.txt
+++ b/llvm/tools/llvm-ctxprof-util/CMakeLists.txt
@@ -10,5 +10,4 @@ llvm-ctxprof-util.cpp
DEPENDS
intrinsics_gen
- GENERATE_DRIVER
)
diff --git a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
index ded8c8a..3bb7681 100644
--- a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
+++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp
@@ -19,8 +19,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/JSON.h"
-#include "llvm/Support/LLVMDriver.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
@@ -132,7 +132,7 @@ Error convertFromJSON() {
return Error::success();
}
-int llvm_ctxprof_util_main(int argc, char **argv, const llvm::ToolContext &) {
+int main(int argc, const char **argv) {
cl::ParseCommandLineOptions(argc, argv, "LLVM Contextual Profile Utils\n");
ExitOnError ExitOnErr("llvm-ctxprof-util: ");
if (FromJSON) {
@@ -145,6 +145,6 @@ int llvm_ctxprof_util_main(int argc, char **argv, const llvm::ToolContext &) {
}
return 0;
}
- llvm_unreachable("Unknown subcommands should have been handled by the "
- "command line parser.");
+ cl::PrintHelpMessage();
+ return 1;
}
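
The hunks above drop the GENERATE_DRIVER/LLVMDriver.h plumbing and give llvm-ctxprof-util an ordinary main(), pulling in InitLLVM.h instead. Only part of the new entry point is visible in the diff; as a rough sketch of the conventional standalone-tool pattern the new include supports (the overview string and body are illustrative, not the tool's actual code):

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"

using namespace llvm;

int main(int argc, const char **argv) {
  // InitLLVM installs the pretty stack trace printer and normalizes argv
  // (e.g. UTF-8 arguments on Windows) before command-line parsing runs.
  InitLLVM X(argc, argv);
  cl::ParseCommandLineOptions(argc, argv, "example standalone tool\n");
  return 0;
}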
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index ed53f8f..adee869 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -466,9 +466,20 @@ private:
// segfaults in the program. Unregister the rseq region so that we can safely
// unmap it later
#ifdef GLIBC_INITS_RSEQ
+ unsigned int RseqStructSize = __rseq_size;
+
+ // Glibc v2.40 (the change is also expected to be backported to v2.35)
+ // changes the definition of __rseq_size to be the usable area of the struct
+ // rather than the actual size of the struct. v2.35 uses only 20 bytes of
+ // the 32 byte struct. For now, it should be safe to assume that if the
+ // usable size is less than 32, the actual size of the struct will be 32
+ // bytes given alignment requirements.
+ if (__rseq_size < 32)
+ RseqStructSize = 32;
+
long RseqDisableOutput =
syscall(SYS_rseq, (intptr_t)__builtin_thread_pointer() + __rseq_offset,
- __rseq_size, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
+ RseqStructSize, RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
if (RseqDisableOutput != 0)
exit(ChildProcessExitCodeE::RSeqDisableFailed);
#endif // GLIBC_INITS_RSEQ
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index a54feb7..d82ecc8 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -584,6 +584,68 @@ static Expected<int64_t> parseChangeSectionLMA(StringRef ArgValue,
return *LMAValue;
}
+static Expected<SectionPatternAddressUpdate>
+parseChangeSectionAddr(StringRef ArgValue, StringRef OptionName,
+ MatchStyle SectionMatchStyle,
+ function_ref<Error(Error)> ErrorCallback) {
+ SectionPatternAddressUpdate PatternUpdate;
+
+ size_t LastSymbolIndex = ArgValue.find_last_of("+-=");
+ if (LastSymbolIndex == StringRef::npos)
+ return createStringError(errc::invalid_argument,
+ "bad format for " + OptionName +
+ ": argument value " + ArgValue +
+ " is invalid. See --help");
+ char UpdateSymbol = ArgValue[LastSymbolIndex];
+
+ StringRef SectionPattern = ArgValue.slice(0, LastSymbolIndex);
+ if (SectionPattern.empty())
+ return createStringError(
+ errc::invalid_argument,
+ "bad format for " + OptionName +
+ ": missing section pattern to apply address change to");
+ if (Error E = PatternUpdate.SectionPattern.addMatcher(NameOrPattern::create(
+ SectionPattern, SectionMatchStyle, ErrorCallback)))
+ return std::move(E);
+
+ StringRef Value = ArgValue.slice(LastSymbolIndex + 1, StringRef::npos);
+ if (Value.empty()) {
+ switch (UpdateSymbol) {
+ case '+':
+ case '-':
+ return createStringError(errc::invalid_argument,
+ "bad format for " + OptionName +
+ ": missing value of offset after '" +
+ std::string({UpdateSymbol}) + "'");
+
+ case '=':
+ return createStringError(errc::invalid_argument,
+ "bad format for " + OptionName +
+ ": missing address value after '='");
+ }
+ }
+ auto AddrValue = getAsInteger<uint64_t>(Value);
+ if (!AddrValue)
+ return createStringError(AddrValue.getError(),
+ "bad format for " + OptionName + ": value after " +
+ std::string({UpdateSymbol}) + " is " + Value +
+ " when it should be a 64-bit integer");
+
+ switch (UpdateSymbol) {
+ case '+':
+ PatternUpdate.Update.Kind = AdjustKind::Add;
+ break;
+ case '-':
+ PatternUpdate.Update.Kind = AdjustKind::Subtract;
+ break;
+ case '=':
+ PatternUpdate.Update.Kind = AdjustKind::Set;
+ }
+
+ PatternUpdate.Update.Value = *AddrValue;
+ return PatternUpdate;
+}
+
// parseObjcopyOptions returns the config and sets the input arguments. If a
// help flag is set then parseObjcopyOptions will print the help message and
// exit.
@@ -874,6 +936,15 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
Config.ChangeSectionLMAValAll = *LMAValue;
}
+ for (auto *Arg : InputArgs.filtered(OBJCOPY_change_section_address)) {
+ Expected<SectionPatternAddressUpdate> AddressUpdate =
+ parseChangeSectionAddr(Arg->getValue(), Arg->getSpelling(),
+ SectionMatchStyle, ErrorCallback);
+ if (!AddressUpdate)
+ return AddressUpdate.takeError();
+ Config.ChangeSectionAddress.push_back(*AddressUpdate);
+ }
+
for (auto *Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
if (!StringRef(Arg->getValue()).contains('='))
return createStringError(errc::invalid_argument,
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index b26c497..434b5ff 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -267,6 +267,13 @@ defm change_section_lma
: Eq<"change-section-lma", "Shift LMA of non-zero-sized sections in the program header by <val>">,
MetaVarName<"*{+|-}val">;
+defm change_section_address
+ : Eq<"change-section-address", "Set the address of the <section> to, or adjust it by, <val>">,
+ MetaVarName<"sectionpattern{=|+|-}val">;
+def adjust_section_vma : JoinedOrSeparate<["--"], "adjust-section-vma">,
+ Alias<change_section_address>,
+ HelpText<"Alias for --change-section-address">;
+
defm add_symbol
: Eq<"add-symbol", "Add new symbol <name> to .symtab. Accepted flags: "
"global, local, weak, default, hidden, protected, file, section, object, "
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index d124002..768a976 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -1484,9 +1484,11 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA,
const MCSubtargetInfo *STI, uint64_t SectionAddr,
uint64_t Start, uint64_t End,
std::unordered_map<uint64_t, std::string> &Labels) {
- // So far only supports PowerPC and X86.
+ // Supported by certain targets.
const bool isPPC = STI->getTargetTriple().isPPC();
- if (!isPPC && !STI->getTargetTriple().isX86())
+ const bool isX86 = STI->getTargetTriple().isX86();
+ const bool isBPF = STI->getTargetTriple().isBPF();
+ if (!isPPC && !isX86 && !isBPF)
return;
if (MIA)
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 3746980..91ec905 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -227,10 +227,6 @@ static cl::opt<bool> DisableLoopUnrolling(
"disable-loop-unrolling",
cl::desc("Disable loop unrolling in all relevant passes"), cl::init(false));
-namespace llvm {
-extern cl::opt<bool> PrintPipelinePasses;
-} // namespace llvm
-
template <typename PassManagerT>
bool tryParsePipelineText(PassBuilder &PB,
const cl::opt<std::string> &PipelineOpt) {
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index d50bdf4..be675bb 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -2141,6 +2141,8 @@ TEST(APFloatTest, getZero) {
{&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1},
{&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1},
{&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1},
+ {&APFloat::Float8E3M4(), false, true, {0, 0}, 1},
+ {&APFloat::Float8E3M4(), true, true, {0x80ULL, 0}, 1},
{&APFloat::FloatTF32(), false, true, {0, 0}, 1},
{&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1},
{&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
@@ -6636,6 +6638,45 @@ TEST(APFloatTest, Float8E4M3FNUZToDouble) {
EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
}
+TEST(APFloatTest, Float8E3M4ToDouble) {
+ APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
+ APFloat PosZeroToDouble(PosZero.convertToDouble());
+ EXPECT_TRUE(PosZeroToDouble.isPosZero());
+ APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
+ APFloat NegZeroToDouble(NegZero.convertToDouble());
+ EXPECT_TRUE(NegZeroToDouble.isNegZero());
+
+ APFloat One(APFloat::Float8E3M4(), "1.0");
+ EXPECT_EQ(1.0, One.convertToDouble());
+ APFloat Two(APFloat::Float8E3M4(), "2.0");
+ EXPECT_EQ(2.0, Two.convertToDouble());
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
+ EXPECT_EQ(15.5F, PosLargest.convertToDouble());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
+ EXPECT_EQ(-15.5F, NegLargest.convertToDouble());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
+ EXPECT_EQ(0x1.p-2, PosSmallest.convertToDouble());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
+ EXPECT_EQ(-0x1.p-2, NegSmallest.convertToDouble());
+
+ APFloat PosSmallestDenorm =
+ APFloat::getSmallest(APFloat::Float8E3M4(), false);
+ EXPECT_TRUE(PosSmallestDenorm.isDenormal());
+ EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToDouble());
+ APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
+ EXPECT_TRUE(NegSmallestDenorm.isDenormal());
+ EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToDouble());
+
+ APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
+ EXPECT_EQ(std::numeric_limits<double>::infinity(), PosInf.convertToDouble());
+ APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
+ EXPECT_EQ(-std::numeric_limits<double>::infinity(), NegInf.convertToDouble());
+ APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
+ EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
+}
+
TEST(APFloatTest, FloatTF32ToDouble) {
APFloat One(APFloat::FloatTF32(), "1.0");
EXPECT_EQ(1.0, One.convertToDouble());
@@ -6944,6 +6985,46 @@ TEST(APFloatTest, Float8E4M3FNToFloat) {
EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
}
+TEST(APFloatTest, Float8E3M4ToFloat) {
+ APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
+ APFloat PosZeroToFloat(PosZero.convertToFloat());
+ EXPECT_TRUE(PosZeroToFloat.isPosZero());
+ APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
+ APFloat NegZeroToFloat(NegZero.convertToFloat());
+ EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+ APFloat One(APFloat::Float8E3M4(), "1.0");
+ EXPECT_EQ(1.0F, One.convertToFloat());
+ APFloat Two(APFloat::Float8E3M4(), "2.0");
+ EXPECT_EQ(2.0F, Two.convertToFloat());
+
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
+ EXPECT_EQ(15.5F, PosLargest.convertToFloat());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
+ EXPECT_EQ(-15.5F, NegLargest.convertToFloat());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
+ EXPECT_EQ(0x1.p-2, PosSmallest.convertToFloat());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
+ EXPECT_EQ(-0x1.p-2, NegSmallest.convertToFloat());
+
+ APFloat PosSmallestDenorm =
+ APFloat::getSmallest(APFloat::Float8E3M4(), false);
+ EXPECT_TRUE(PosSmallestDenorm.isDenormal());
+ EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToFloat());
+ APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
+ EXPECT_TRUE(NegSmallestDenorm.isDenormal());
+ EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToFloat());
+
+ APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
+ EXPECT_EQ(std::numeric_limits<float>::infinity(), PosInf.convertToFloat());
+ APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
+ EXPECT_EQ(-std::numeric_limits<float>::infinity(), NegInf.convertToFloat());
+ APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
+ EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
+}
+
TEST(APFloatTest, FloatTF32ToFloat) {
APFloat PosZero = APFloat::getZero(APFloat::FloatTF32());
APFloat PosZeroToFloat(PosZero.convertToFloat());
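
The Float8E3M4 expectations added above follow from the format's layout, assuming the name means 1 sign bit, 3 exponent bits, and 4 mantissa bits with an IEEE-style bias of 3 and the all-ones exponent reserved for Inf/NaN: the largest normal value is (2 - 2^-4) * 2^3 = 15.5, the smallest normal is 2^-2, and the smallest denormal is 2^-2 * 2^-4 = 2^-6. A small self-contained check of that arithmetic (the constants only, not the APFloat API):

#include <cassert>
#include <cmath>

int main() {
  // Float8E3M4 under the assumed layout: bias = 2^(3-1) - 1 = 3, so normal
  // exponents run from -2 to +3 and the mantissa contributes 4 bits.
  const double MaxNormal = (2.0 - std::ldexp(1.0, -4)) * std::ldexp(1.0, 3);
  const double MinNormal = std::ldexp(1.0, -2);
  const double MinDenormal = MinNormal * std::ldexp(1.0, -4);
  assert(MaxNormal == 15.5);      // matches getLargest() in the test
  assert(MinNormal == 0x1.p-2);   // matches getSmallestNormalized()
  assert(MinDenormal == 0x1.p-6); // matches getSmallest()
  return 0;
}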
diff --git a/llvm/unittests/Analysis/DXILResourceTest.cpp b/llvm/unittests/Analysis/DXILResourceTest.cpp
index 554cbd0..7bbb417 100644
--- a/llvm/unittests/Analysis/DXILResourceTest.cpp
+++ b/llvm/unittests/Analysis/DXILResourceTest.cpp
@@ -151,6 +151,19 @@ TEST(DXILResource, AnnotationsAndMetadata) {
EXPECT_MDEQ(
MD, TestMD.get(0, Symbol, "Buffer0", 0, 0, 1, 12, 0, TestMD.get(1, 16)));
+ // StructuredBuffer<float3> Buffer1 : register(t1);
+ Symbol = UndefValue::get(StructType::create(
+ Context, {Floatx3Ty}, "class.StructuredBuffer<vector<float, 3> >"));
+ Resource = ResourceInfo::StructuredBuffer(Symbol, "Buffer1",
+ /*Stride=*/12, {});
+ Resource.bind(1, 0, 1, 1);
+ Props = Resource.getAnnotateProps();
+ EXPECT_EQ(Props.first, 0x0000000cU);
+ EXPECT_EQ(Props.second, 0x0000000cU);
+ MD = Resource.getAsMetadata(Context);
+ EXPECT_MDEQ(
+ MD, TestMD.get(1, Symbol, "Buffer1", 0, 1, 1, 12, 0, TestMD.get(1, 12)));
+
// Texture2D<float4> ColorMapTexture : register(t2);
Symbol = UndefValue::get(StructType::create(
Context, {Floatx4Ty}, "class.Texture2D<vector<float, 4> >"));
diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
index a6a5ffd..76e6095 100644
--- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -1625,4 +1625,40 @@ TEST_F(ScalarEvolutionsTest, ForgetValueWithOverflowInst) {
});
}
+TEST_F(ScalarEvolutionsTest, ComplexityComparatorIsStrictWeakOrdering) {
+ // Regression test for a case where caching of equivalent values caused the
+ // comparator to get inconsistent.
+ LLVMContext C;
+ SMDiagnostic Err;
+ std::unique_ptr<Module> M = parseAssemblyString(R"(
+ define i32 @foo(i32 %arg0) {
+ %1 = add i32 %arg0, 1
+ %2 = add i32 %arg0, 1
+ %3 = xor i32 %2, %1
+ %4 = add i32 %3, %2
+ %5 = add i32 %arg0, 1
+ %6 = xor i32 %5, %arg0
+ %7 = add i32 %arg0, %6
+ %8 = add i32 %5, %7
+ %9 = xor i32 %8, %7
+ %10 = add i32 %9, %8
+ %11 = xor i32 %10, %9
+ %12 = add i32 %11, %10
+ %13 = xor i32 %12, %11
+ %14 = add i32 %12, %13
+ %15 = add i32 %14, %4
+ ret i32 %15
+ })",
+ Err, C);
+
+ ASSERT_TRUE(M && "Could not parse module?");
+ ASSERT_TRUE(!verifyModule(*M) && "Must have been well formed!");
+
+ runWithSE(*M, "foo", [](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
+ // When _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG, this will
+ // crash if the comparator has the specific caching bug.
+ SE.getSCEV(F.getEntryBlock().getTerminator()->getOperand(0));
+ });
+}
+
} // end namespace llvm
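
The regression test above relies on libc++'s debug hardening mode catching comparators that are not strict weak orderings (per the comment in the test, the build crashes if the cached comparison becomes inconsistent). Illustrative only, not the SCEV comparator itself: a "less than" whose answer depends on mutable cached state can become asymmetric, which is exactly the class of bug those checks flag.

#include <cassert>

// A comparator that caches state across calls: once the cache is "warm",
// lt(a, b) and lt(b, a) can both return true, violating asymmetry and hence
// the strict weak ordering contract that sorting code assumes.
struct CachingLess {
  mutable bool Warm = false;
  bool operator()(int A, int B) const {
    if (Warm)
      return true;
    Warm = true;
    return A < B;
  }
};

int main() {
  CachingLess Lt;
  bool AB = Lt(1, 2); // true, and warms the cache
  bool BA = Lt(2, 1); // also true: not a strict weak ordering
  assert(AB && BA);
  return 0;
}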
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
index 1447291..d344ebe 100644
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -255,6 +255,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
"declare double @modf(double, double*)\n"
"declare float @modff(float, float*)\n"
"declare x86_fp80 @modfl(x86_fp80, x86_fp80*)\n"
+ "declare double @nan(ptr)\n"
+ "declare float @nanf(ptr)\n"
+ "declare x86_fp80 @nanl(ptr)\n"
"declare double @nearbyint(double)\n"
"declare float @nearbyintf(float)\n"
"declare x86_fp80 @nearbyintl(x86_fp80)\n"
@@ -500,6 +503,10 @@ TEST_F(TargetLibraryInfoTest, ValidProto) {
"declare i32 @atexit(void ()*)\n"
+ "declare void @abort()\n"
+ "declare void @exit(i32)\n"
+ "declare void @_Exit(i32)\n"
+
"declare i32 @__nvvm_reflect(i8*)\n"
"declare i8* @__memcpy_chk(i8*, i8*, i64, i64)\n"
@@ -663,4 +670,4 @@ protected:
TEST_F(TLITestAarch64, TestFrem) {
EXPECT_EQ(getScalarName(Instruction::FRem, Type::getDoubleTy(Ctx)), "fmod");
EXPECT_EQ(getScalarName(Instruction::FRem, Type::getFloatTy(Ctx)), "fmodf");
-} \ No newline at end of file
+}
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index eadb0e2..074247e 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -131,6 +131,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
SDValue ICMP_EQ01 = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETEQ);
SDValue ICMP_EQ10 = DAG->getSetCC(DL, MVT::i1, Op1, Op0, ISD::SETEQ);
+ auto Int1VT = EVT::getIntegerVT(Context, 1);
+ SDValue Cond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Int1VT);
+ SDValue T = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 4, Int1VT);
+ SDValue F = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 5, Int1VT);
+ SDValue Select = DAG->getSelect(DL, MVT::i1, Cond, T, F);
+
+ auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+ SDValue V1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 6, VInt32VT);
+ SDValue V2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 7, VInt32VT);
+ SDValue VSelect = DAG->getNode(ISD::VSELECT, DL, VInt32VT, Cond, V1, V2);
+
using namespace SDPatternMatch;
ISD::CondCode CC;
EXPECT_TRUE(sd_match(ICMP_UGT, m_SetCC(m_Value(), m_Value(),
@@ -153,6 +164,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchTernaryOp) {
m_SpecificCondCode(ISD::SETEQ))));
EXPECT_TRUE(sd_match(ICMP_EQ10, m_c_SetCC(m_Specific(Op0), m_Specific(Op1),
m_SpecificCondCode(ISD::SETEQ))));
+
+ EXPECT_TRUE(sd_match(
+ Select, m_Select(m_Specific(Cond), m_Specific(T), m_Specific(F))));
+ EXPECT_FALSE(sd_match(
+ Select, m_Select(m_Specific(Cond), m_Specific(F), m_Specific(T))));
+ EXPECT_FALSE(sd_match(ICMP_EQ01, m_Select(m_Specific(Op0), m_Specific(Op1),
+ m_SpecificCondCode(ISD::SETEQ))));
+ EXPECT_TRUE(sd_match(
+ VSelect, m_VSelect(m_Specific(Cond), m_Specific(V1), m_Specific(V2))));
+ EXPECT_FALSE(sd_match(
+ Select, m_VSelect(m_Specific(Cond), m_Specific(V1), m_Specific(V2))));
}
TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
@@ -228,6 +250,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
SDValue Neg = DAG->getNegative(Op0, DL, Int32VT);
SDValue Not = DAG->getNOT(DL, Op0, Int32VT);
+ SDValue VScale = DAG->getVScale(DL, Int32VT, APInt::getMaxValue(32));
+
using namespace SDPatternMatch;
EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
EXPECT_TRUE(sd_match(SExt, m_SExt(m_Value())));
@@ -238,6 +262,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
EXPECT_FALSE(sd_match(ZExt, m_Neg(m_Value())));
EXPECT_FALSE(sd_match(Sub, m_Neg(m_Value())));
EXPECT_FALSE(sd_match(Neg, m_Not(m_Value())));
+ EXPECT_TRUE(sd_match(VScale, m_VScale(m_Value())));
}
TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
diff --git a/llvm/unittests/IR/DominatorTreeTest.cpp b/llvm/unittests/IR/DominatorTreeTest.cpp
index 44bde74..555348c 100644
--- a/llvm/unittests/IR/DominatorTreeTest.cpp
+++ b/llvm/unittests/IR/DominatorTreeTest.cpp
@@ -607,11 +607,10 @@ TEST(DominatorTree, DeletingEdgesIntroducesInfiniteLoop2) {
SwitchC->removeCase(SwitchC->case_begin());
DT->deleteEdge(C, C2);
PDT->deleteEdge(C, C2);
- C2->removeFromParent();
EXPECT_EQ(DT->getNode(C2), nullptr);
PDT->eraseNode(C2);
- delete C2;
+ C2->eraseFromParent();
EXPECT_TRUE(DT->verify());
EXPECT_TRUE(PDT->verify());
diff --git a/llvm/unittests/IR/FunctionTest.cpp b/llvm/unittests/IR/FunctionTest.cpp
index 9aaff3e..4026679 100644
--- a/llvm/unittests/IR/FunctionTest.cpp
+++ b/llvm/unittests/IR/FunctionTest.cpp
@@ -487,6 +487,98 @@ TEST(FunctionTest, EraseBBs) {
EXPECT_EQ(F->size(), 0u);
}
+TEST(FunctionTest, BasicBlockNumbers) {
+ LLVMContext Context;
+ Type *VoidType = Type::getVoidTy(Context);
+ FunctionType *FuncType = FunctionType::get(VoidType, false);
+ std::unique_ptr<Function> Func(
+ Function::Create(FuncType, GlobalValue::ExternalLinkage));
+
+ EXPECT_EQ(Func->getBlockNumberEpoch(), 0u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 0u);
+
+ BasicBlock *BB1 = BasicBlock::Create(Context, "bb1", Func.get());
+ EXPECT_EQ(BB1->getNumber(), 0u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 1u);
+ BasicBlock *BB2 = BasicBlock::Create(Context, "bb2", Func.get());
+ EXPECT_EQ(BB2->getNumber(), 1u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 2u);
+ BasicBlock *BB3 = BasicBlock::Create(Context, "bb3", Func.get());
+ EXPECT_EQ(BB3->getNumber(), 2u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 3u);
+
+ BB2->eraseFromParent();
+ // Erasing doesn't trigger renumbering
+ EXPECT_EQ(BB1->getNumber(), 0u);
+ EXPECT_EQ(BB3->getNumber(), 2u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 3u);
+ // ... and numbers are assigned in monotonically increasing order
+ BasicBlock *BB4 = BasicBlock::Create(Context, "bb4", Func.get());
+ EXPECT_EQ(BB4->getNumber(), 3u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 4u);
+ // ... even if inserted not at the end
+ BasicBlock *BB5 = BasicBlock::Create(Context, "bb5", Func.get(), BB1);
+ EXPECT_EQ(BB5->getNumber(), 4u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 5u);
+
+ // Func is now: bb5, bb1, bb3, bb4
+ // Renumbering assigns numbers in their order in the function
+ EXPECT_EQ(Func->getBlockNumberEpoch(), 0u);
+ Func->renumberBlocks();
+ EXPECT_EQ(Func->getBlockNumberEpoch(), 1u);
+ EXPECT_EQ(BB5->getNumber(), 0u);
+ EXPECT_EQ(BB1->getNumber(), 1u);
+ EXPECT_EQ(BB3->getNumber(), 2u);
+ EXPECT_EQ(BB4->getNumber(), 3u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 4u);
+
+ // Moving a block inside the function doesn't change numbers
+ BB1->moveBefore(BB5);
+ EXPECT_EQ(BB5->getNumber(), 0u);
+ EXPECT_EQ(BB1->getNumber(), 1u);
+ EXPECT_EQ(BB3->getNumber(), 2u);
+ EXPECT_EQ(BB4->getNumber(), 3u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 4u);
+
+ // Removing a block and adding it back assigns a new number, because the
+ // block was temporarily without a parent.
+ BB4->removeFromParent();
+ BB4->insertInto(Func.get());
+ EXPECT_EQ(BB5->getNumber(), 0u);
+ EXPECT_EQ(BB1->getNumber(), 1u);
+ EXPECT_EQ(BB3->getNumber(), 2u);
+ EXPECT_EQ(BB4->getNumber(), 4u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 5u);
+
+ std::unique_ptr<Function> Func2(
+ Function::Create(FuncType, GlobalValue::ExternalLinkage));
+ BasicBlock *BB6 = BasicBlock::Create(Context, "bb6", Func2.get());
+ EXPECT_EQ(BB6->getNumber(), 0u);
+ EXPECT_EQ(Func2->getMaxBlockNumber(), 1u);
+ // Moving a block to a different function assigns a new number
+ BB3->removeFromParent();
+ BB3->insertInto(Func2.get(), BB6);
+ EXPECT_EQ(BB3->getParent(), Func2.get());
+ EXPECT_EQ(BB3->getNumber(), 1u);
+ EXPECT_EQ(Func2->getMaxBlockNumber(), 2u);
+
+ EXPECT_EQ(Func2->getBlockNumberEpoch(), 0u);
+ Func2->renumberBlocks();
+ EXPECT_EQ(Func2->getBlockNumberEpoch(), 1u);
+ EXPECT_EQ(BB3->getNumber(), 0u);
+ EXPECT_EQ(BB6->getNumber(), 1u);
+ EXPECT_EQ(Func2->getMaxBlockNumber(), 2u);
+
+ // splice works as expected and assigns new numbers
+ Func->splice(Func->end(), Func2.get());
+ EXPECT_EQ(BB5->getNumber(), 0u);
+ EXPECT_EQ(BB1->getNumber(), 1u);
+ EXPECT_EQ(BB4->getNumber(), 4u);
+ EXPECT_EQ(BB3->getNumber(), 5u);
+ EXPECT_EQ(BB6->getNumber(), 6u);
+ EXPECT_EQ(Func->getMaxBlockNumber(), 7u);
+}
+
TEST(FunctionTest, UWTable) {
LLVMContext Ctx;
std::unique_ptr<Module> M = parseIR(Ctx, R"(
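
The BasicBlockNumbers test above documents the contract of the block-numbering API: numbers are assigned monotonically as blocks are inserted, erasing does not renumber, and renumberBlocks() recompacts them in function order and bumps the epoch. One natural consumer, sketched using only the accessors the test exercises (the helper itself is hypothetical):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include <vector>

using namespace llvm;

// Dense per-block side table indexed by block number instead of a map keyed
// by BasicBlock*. Valid until blocks are added, moved, or erased again.
static std::vector<unsigned> countInstrsPerBlock(Function &F) {
  F.renumberBlocks();                                // compact 0..N-1 numbering
  std::vector<unsigned> Counts(F.getMaxBlockNumber(), 0);
  for (BasicBlock &BB : F)
    Counts[BB.getNumber()] = BB.size();
  return Counts;
}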
diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp
index 309fcc9..379f97f 100644
--- a/llvm/unittests/IR/PatternMatch.cpp
+++ b/llvm/unittests/IR/PatternMatch.cpp
@@ -2317,6 +2317,14 @@ TYPED_TEST(MutableConstTest, ICmp) {
EXPECT_FALSE(m_SpecificCmp(ICmpInst::getInversePredicate(Pred),
m_Value(MatchL), m_Value(MatchR))
.match((InstructionType)IRB.CreateICmp(Pred, L, R)));
+
+ EXPECT_TRUE(m_c_SpecificICmp(Pred, m_Specific(L), m_Specific(R))
+ .match((InstructionType)IRB.CreateICmp(Pred, L, R)));
+ EXPECT_TRUE(m_c_SpecificICmp(ICmpInst::getSwappedPredicate(Pred),
+ m_Specific(R), m_Specific(L))
+ .match((InstructionType)IRB.CreateICmp(Pred, L, R)));
+ EXPECT_FALSE(m_c_SpecificICmp(Pred, m_Specific(R), m_Specific(L))
+ .match((InstructionType)IRB.CreateICmp(Pred, L, R)));
}
TYPED_TEST(MutableConstTest, FCmp) {
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index c600103..cbf3952 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -9,6 +9,7 @@
#include "llvm/SandboxIR/SandboxIR.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
@@ -90,7 +91,7 @@ define void @foo(i32 %v1) {
EXPECT_FALSE(isa<sandboxir::Instruction>(Const0));
EXPECT_TRUE(isa<sandboxir::Instruction>(OpaqueI));
- EXPECT_FALSE(isa<sandboxir::User>(F));
+ EXPECT_TRUE(isa<sandboxir::User>(F));
EXPECT_FALSE(isa<sandboxir::User>(Arg0));
EXPECT_FALSE(isa<sandboxir::User>(BB));
EXPECT_TRUE(isa<sandboxir::User>(AddI));
@@ -180,8 +181,8 @@ define i32 @foo(i32 %v0, i32 %v1) {
BS << "\n";
I0->getOperandUse(0).dump(BS);
EXPECT_EQ(Buff, R"IR(
-Def: i32 %v0 ; SB1. (Argument)
-User: %add0 = add i32 %v0, %v1 ; SB4. (Opaque)
+Def: i32 %v0 ; SB2. (Argument)
+User: %add0 = add i32 %v0, %v1 ; SB5. (Opaque)
OperandNo: 0
)IR");
#endif // NDEBUG
@@ -398,10 +399,10 @@ bb1:
EXPECT_EQ(Buff, R"IR(
void @foo(i32 %arg0, i32 %arg1) {
bb0:
- br label %bb1 ; SB3. (Br)
+ br label %bb1 ; SB4. (Br)
bb1:
- ret void ; SB5. (Ret)
+ ret void ; SB6. (Ret)
}
)IR");
}
@@ -466,7 +467,7 @@ bb1:
BB0.dump(BS);
EXPECT_EQ(Buff, R"IR(
bb0:
- br label %bb1 ; SB2. (Br)
+ br label %bb1 ; SB3. (Br)
)IR");
}
#endif // NDEBUG
@@ -738,6 +739,7 @@ TEST_F(SandboxIRTest, LoadInst) {
parseIR(C, R"IR(
define void @foo(ptr %arg0, ptr %arg1) {
%ld = load i8, ptr %arg0, align 64
+ %vld = load volatile i8, ptr %arg0, align 64
ret void
}
)IR");
@@ -749,8 +751,13 @@ define void @foo(ptr %arg0, ptr %arg1) {
auto *BB = &*F->begin();
auto It = BB->begin();
auto *Ld = cast<sandboxir::LoadInst>(&*It++);
+ auto *VLd = cast<sandboxir::LoadInst>(&*It++);
auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+ // Check isVolatile()
+ EXPECT_FALSE(Ld->isVolatile());
+ // Check isVolatile()
+ EXPECT_TRUE(VLd->isVolatile());
// Check getPointerOperand()
EXPECT_EQ(Ld->getPointerOperand(), Arg0);
// Check getAlign()
@@ -759,16 +766,49 @@ define void @foo(ptr %arg0, ptr %arg1) {
sandboxir::LoadInst *NewLd =
sandboxir::LoadInst::create(Ld->getType(), Arg1, Align(8),
/*InsertBefore=*/Ret, Ctx, "NewLd");
+ EXPECT_FALSE(NewLd->isVolatile());
EXPECT_EQ(NewLd->getType(), Ld->getType());
EXPECT_EQ(NewLd->getPointerOperand(), Arg1);
EXPECT_EQ(NewLd->getAlign(), 8);
EXPECT_EQ(NewLd->getName(), "NewLd");
+ // Check create(InsertBefore, IsVolatile=true)
+ sandboxir::LoadInst *NewVLd =
+ sandboxir::LoadInst::create(VLd->getType(), Arg1, Align(8),
+ /*InsertBefore=*/Ret,
+ /*IsVolatile=*/true, Ctx, "NewVLd");
+
+ EXPECT_TRUE(NewVLd->isVolatile());
+ EXPECT_EQ(NewVLd->getName(), "NewVLd");
+ // Check create(InsertAtEnd)
+ sandboxir::LoadInst *NewLdEnd =
+ sandboxir::LoadInst::create(Ld->getType(), Arg1, Align(8),
+ /*InsertAtEnd=*/BB, Ctx, "NewLdEnd");
+ EXPECT_FALSE(NewLdEnd->isVolatile());
+ EXPECT_EQ(NewLdEnd->getName(), "NewLdEnd");
+ EXPECT_EQ(NewLdEnd->getType(), Ld->getType());
+ EXPECT_EQ(NewLdEnd->getPointerOperand(), Arg1);
+ EXPECT_EQ(NewLdEnd->getAlign(), 8);
+ EXPECT_EQ(NewLdEnd->getParent(), BB);
+ EXPECT_EQ(NewLdEnd->getNextNode(), nullptr);
+ // Check create(InsertAtEnd, IsVolatile=true)
+ sandboxir::LoadInst *NewVLdEnd =
+ sandboxir::LoadInst::create(VLd->getType(), Arg1, Align(8),
+ /*InsertAtEnd=*/BB,
+ /*IsVolatile=*/true, Ctx, "NewVLdEnd");
+ EXPECT_TRUE(NewVLdEnd->isVolatile());
+ EXPECT_EQ(NewVLdEnd->getName(), "NewVLdEnd");
+ EXPECT_EQ(NewVLdEnd->getType(), VLd->getType());
+ EXPECT_EQ(NewVLdEnd->getPointerOperand(), Arg1);
+ EXPECT_EQ(NewVLdEnd->getAlign(), 8);
+ EXPECT_EQ(NewVLdEnd->getParent(), BB);
+ EXPECT_EQ(NewVLdEnd->getNextNode(), nullptr);
}
TEST_F(SandboxIRTest, StoreInst) {
parseIR(C, R"IR(
define void @foo(i8 %val, ptr %ptr) {
store i8 %val, ptr %ptr, align 64
+ store volatile i8 %val, ptr %ptr, align 64
ret void
}
)IR");
@@ -780,9 +820,12 @@ define void @foo(i8 %val, ptr %ptr) {
auto *BB = &*F->begin();
auto It = BB->begin();
auto *St = cast<sandboxir::StoreInst>(&*It++);
+ auto *VSt = cast<sandboxir::StoreInst>(&*It++);
auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
// Check that the StoreInst has been created correctly.
+ EXPECT_FALSE(St->isVolatile());
+ EXPECT_TRUE(VSt->isVolatile());
// Check getPointerOperand()
EXPECT_EQ(St->getValueOperand(), Val);
EXPECT_EQ(St->getPointerOperand(), Ptr);
@@ -792,10 +835,46 @@ define void @foo(i8 %val, ptr %ptr) {
sandboxir::StoreInst *NewSt =
sandboxir::StoreInst::create(Val, Ptr, Align(8),
/*InsertBefore=*/Ret, Ctx);
+ EXPECT_FALSE(NewSt->isVolatile());
EXPECT_EQ(NewSt->getType(), St->getType());
EXPECT_EQ(NewSt->getValueOperand(), Val);
EXPECT_EQ(NewSt->getPointerOperand(), Ptr);
EXPECT_EQ(NewSt->getAlign(), 8);
+ EXPECT_EQ(NewSt->getNextNode(), Ret);
+ // Check create(InsertBefore, IsVolatile=true)
+ sandboxir::StoreInst *NewVSt =
+ sandboxir::StoreInst::create(Val, Ptr, Align(8),
+ /*InsertBefore=*/Ret,
+ /*IsVolatile=*/true, Ctx);
+ EXPECT_TRUE(NewVSt->isVolatile());
+ EXPECT_EQ(NewVSt->getType(), VSt->getType());
+ EXPECT_EQ(NewVSt->getValueOperand(), Val);
+ EXPECT_EQ(NewVSt->getPointerOperand(), Ptr);
+ EXPECT_EQ(NewVSt->getAlign(), 8);
+ EXPECT_EQ(NewVSt->getNextNode(), Ret);
+ // Check create(InsertAtEnd)
+ sandboxir::StoreInst *NewStEnd =
+ sandboxir::StoreInst::create(Val, Ptr, Align(8),
+ /*InsertAtEnd=*/BB, Ctx);
+ EXPECT_FALSE(NewStEnd->isVolatile());
+ EXPECT_EQ(NewStEnd->getType(), St->getType());
+ EXPECT_EQ(NewStEnd->getValueOperand(), Val);
+ EXPECT_EQ(NewStEnd->getPointerOperand(), Ptr);
+ EXPECT_EQ(NewStEnd->getAlign(), 8);
+ EXPECT_EQ(NewStEnd->getParent(), BB);
+ EXPECT_EQ(NewStEnd->getNextNode(), nullptr);
+ // Check create(InsertAtEnd, IsVolatile=true)
+ sandboxir::StoreInst *NewVStEnd =
+ sandboxir::StoreInst::create(Val, Ptr, Align(8),
+ /*InsertAtEnd=*/BB,
+ /*IsVolatile=*/true, Ctx);
+ EXPECT_TRUE(NewVStEnd->isVolatile());
+ EXPECT_EQ(NewVStEnd->getType(), VSt->getType());
+ EXPECT_EQ(NewVStEnd->getValueOperand(), Val);
+ EXPECT_EQ(NewVStEnd->getPointerOperand(), Ptr);
+ EXPECT_EQ(NewVStEnd->getAlign(), 8);
+ EXPECT_EQ(NewVStEnd->getParent(), BB);
+ EXPECT_EQ(NewVStEnd->getNextNode(), nullptr);
}
TEST_F(SandboxIRTest, ReturnInst) {
@@ -836,3 +915,998 @@ define i8 @foo(i8 %val) {
sandboxir::ReturnInst::create(Val, /*InsertAtEnd=*/BB, Ctx));
EXPECT_EQ(NewRet4->getReturnValue(), Val);
}
+
+TEST_F(SandboxIRTest, CallBase) {
+ parseIR(C, R"IR(
+declare void @bar1(i8)
+declare void @bar2()
+declare void @bar3()
+declare void @variadic(ptr, ...)
+
+define i8 @foo(i8 %arg0, i32 %arg1, ptr %indirectFoo) {
+ %call = call i8 @foo(i8 %arg0, i32 %arg1)
+ call void @bar1(i8 %arg0)
+ call void @bar2()
+ call void %indirectFoo()
+ call void @bar2() noreturn
+ tail call fastcc void @bar2()
+ call void (ptr, ...) @variadic(ptr %indirectFoo, i32 1)
+ ret i8 %call
+}
+)IR");
+ llvm::Function &LLVMF = *M->getFunction("foo");
+ unsigned ArgIdx = 0;
+ llvm::Argument *LLVMArg0 = LLVMF.getArg(ArgIdx++);
+ llvm::Argument *LLVMArg1 = LLVMF.getArg(ArgIdx++);
+ llvm::BasicBlock *LLVMBB = &*LLVMF.begin();
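+  // Collect the leading llvm::CallBase instructions so that each one can be
+  // compared against its SandboxIR counterpart below.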
+ SmallVector<llvm::CallBase *, 8> LLVMCalls;
+ auto LLVMIt = LLVMBB->begin();
+ while (isa<llvm::CallBase>(&*LLVMIt))
+ LLVMCalls.push_back(cast<llvm::CallBase>(&*LLVMIt++));
+
+ sandboxir::Context Ctx(C);
+ sandboxir::Function &F = *Ctx.createFunction(&LLVMF);
+
+ for (llvm::CallBase *LLVMCall : LLVMCalls) {
+ // Check classof(Instruction *).
+ auto *Call = cast<sandboxir::CallBase>(Ctx.getValue(LLVMCall));
+ // Check classof(Value *).
+ EXPECT_TRUE(isa<sandboxir::CallBase>((sandboxir::Value *)Call));
+ // Check getFunctionType().
+ EXPECT_EQ(Call->getFunctionType(), LLVMCall->getFunctionType());
+ // Check data_ops().
+ EXPECT_EQ(range_size(Call->data_ops()), range_size(LLVMCall->data_ops()));
+ auto DataOpIt = Call->data_operands_begin();
+ for (llvm::Use &LLVMUse : LLVMCall->data_ops()) {
+ Value *LLVMOp = LLVMUse.get();
+ sandboxir::Use Use = *DataOpIt++;
+ EXPECT_EQ(Ctx.getValue(LLVMOp), Use.get());
+ // Check isDataOperand().
+ EXPECT_EQ(Call->isDataOperand(Use), LLVMCall->isDataOperand(&LLVMUse));
+ // Check getDataOperandNo().
+ EXPECT_EQ(Call->getDataOperandNo(Use),
+ LLVMCall->getDataOperandNo(&LLVMUse));
+ // Check isArgOperand().
+ EXPECT_EQ(Call->isArgOperand(Use), LLVMCall->isArgOperand(&LLVMUse));
+ // Check isCallee().
+ EXPECT_EQ(Call->isCallee(Use), LLVMCall->isCallee(&LLVMUse));
+ }
+ // Check data_operands_empty().
+ EXPECT_EQ(Call->data_operands_empty(), LLVMCall->data_operands_empty());
+ // Check data_operands_size().
+ EXPECT_EQ(Call->data_operands_size(), LLVMCall->data_operands_size());
+ // Check getNumTotalBundleOperands().
+ EXPECT_EQ(Call->getNumTotalBundleOperands(),
+ LLVMCall->getNumTotalBundleOperands());
+ // Check args().
+ EXPECT_EQ(range_size(Call->args()), range_size(LLVMCall->args()));
+ auto ArgIt = Call->arg_begin();
+ for (llvm::Use &LLVMUse : LLVMCall->args()) {
+ Value *LLVMArg = LLVMUse.get();
+ sandboxir::Use Use = *ArgIt++;
+ EXPECT_EQ(Ctx.getValue(LLVMArg), Use.get());
+ }
+ // Check arg_empty().
+ EXPECT_EQ(Call->arg_empty(), LLVMCall->arg_empty());
+ // Check arg_size().
+ EXPECT_EQ(Call->arg_size(), LLVMCall->arg_size());
+ for (unsigned ArgIdx = 0, E = Call->arg_size(); ArgIdx != E; ++ArgIdx) {
+ // Check getArgOperand().
+ EXPECT_EQ(Call->getArgOperand(ArgIdx),
+ Ctx.getValue(LLVMCall->getArgOperand(ArgIdx)));
+ // Check getArgOperandUse().
+ sandboxir::Use Use = Call->getArgOperandUse(ArgIdx);
+ llvm::Use &LLVMUse = LLVMCall->getArgOperandUse(ArgIdx);
+ EXPECT_EQ(Use.get(), Ctx.getValue(LLVMUse.get()));
+ // Check getArgOperandNo().
+ EXPECT_EQ(Call->getArgOperandNo(Use),
+ LLVMCall->getArgOperandNo(&LLVMUse));
+ }
+ // Check hasArgument().
+ SmallVector<llvm::Value *> TestArgs(
+ {LLVMArg0, LLVMArg1, &LLVMF, LLVMBB, LLVMCall});
+ for (llvm::Value *LLVMV : TestArgs) {
+ sandboxir::Value *V = Ctx.getValue(LLVMV);
+ EXPECT_EQ(Call->hasArgument(V), LLVMCall->hasArgument(LLVMV));
+ }
+ // Check getCalledOperand().
+ EXPECT_EQ(Call->getCalledOperand(),
+ Ctx.getValue(LLVMCall->getCalledOperand()));
+ // Check getCalledOperandUse().
+ EXPECT_EQ(Call->getCalledOperandUse().get(),
+ Ctx.getValue(LLVMCall->getCalledOperandUse()));
+ // Check getCalledFunction().
+ if (LLVMCall->getCalledFunction() == nullptr)
+ EXPECT_EQ(Call->getCalledFunction(), nullptr);
+ else {
+ auto *LLVMCF = cast<llvm::Function>(LLVMCall->getCalledFunction());
+ (void)LLVMCF;
+ EXPECT_EQ(Call->getCalledFunction(),
+ cast<sandboxir::Function>(
+ Ctx.getValue(LLVMCall->getCalledFunction())));
+ }
+ // Check isIndirectCall().
+ EXPECT_EQ(Call->isIndirectCall(), LLVMCall->isIndirectCall());
+ // Check getCaller().
+ EXPECT_EQ(Call->getCaller(), Ctx.getValue(LLVMCall->getCaller()));
+ // Check isMustTailCall().
+ EXPECT_EQ(Call->isMustTailCall(), LLVMCall->isMustTailCall());
+ // Check isTailCall().
+ EXPECT_EQ(Call->isTailCall(), LLVMCall->isTailCall());
+ // Check getIntrinsicID().
+ EXPECT_EQ(Call->getIntrinsicID(), LLVMCall->getIntrinsicID());
+ // Check getCallingConv().
+ EXPECT_EQ(Call->getCallingConv(), LLVMCall->getCallingConv());
+ // Check isInlineAsm().
+ EXPECT_EQ(Call->isInlineAsm(), LLVMCall->isInlineAsm());
+ }
+
+ auto *Arg0 = F.getArg(0);
+ auto *Arg1 = F.getArg(1);
+ auto *BB = &*F.begin();
+ auto It = BB->begin();
+ auto *Call0 = cast<sandboxir::CallBase>(&*It++);
+ [[maybe_unused]] auto *Call1 = cast<sandboxir::CallBase>(&*It++);
+ auto *Call2 = cast<sandboxir::CallBase>(&*It++);
+ // Check setArgOperand
+ Call0->setArgOperand(0, Arg1);
+ EXPECT_EQ(Call0->getArgOperand(0), Arg1);
+ Call0->setArgOperand(0, Arg0);
+ EXPECT_EQ(Call0->getArgOperand(0), Arg0);
+
+ auto *Bar3F = Ctx.createFunction(M->getFunction("bar3"));
+
+ // Check setCalledOperand
+ auto *SvOp = Call0->getCalledOperand();
+ Call0->setCalledOperand(Bar3F);
+ EXPECT_EQ(Call0->getCalledOperand(), Bar3F);
+ Call0->setCalledOperand(SvOp);
+ // Check setCalledFunction
+ Call2->setCalledFunction(Bar3F);
+ EXPECT_EQ(Call2->getCalledFunction(), Bar3F);
+}
+
+TEST_F(SandboxIRTest, CallInst) {
+ parseIR(C, R"IR(
+define i8 @foo(i8 %arg) {
+ %call = call i8 @foo(i8 %arg)
+ ret i8 %call
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ auto &F = *Ctx.createFunction(&LLVMF);
+ unsigned ArgIdx = 0;
+ auto *Arg0 = F.getArg(ArgIdx++);
+ auto *BB = &*F.begin();
+ auto It = BB->begin();
+ auto *Call = cast<sandboxir::CallInst>(&*It++);
+ auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+ EXPECT_EQ(Call->getNumOperands(), 2u);
+ EXPECT_EQ(Ret->getOpcode(), sandboxir::Instruction::Opcode::Ret);
+ FunctionType *FTy = F.getFunctionType();
+ SmallVector<sandboxir::Value *, 1> Args;
+ Args.push_back(Arg0);
+ {
+ // Check create() WhereIt.
+ auto *Call = cast<sandboxir::CallInst>(sandboxir::CallInst::create(
+ FTy, &F, Args, /*WhereIt=*/Ret->getIterator(), BB, Ctx));
+ EXPECT_EQ(Call->getNextNode(), Ret);
+ EXPECT_EQ(Call->getCalledFunction(), &F);
+ EXPECT_EQ(range_size(Call->args()), 1u);
+ EXPECT_EQ(Call->getArgOperand(0), Arg0);
+ }
+ {
+ // Check create() InsertBefore.
+ auto *Call = cast<sandboxir::CallInst>(
+ sandboxir::CallInst::create(FTy, &F, Args, /*InsertBefore=*/Ret, Ctx));
+ EXPECT_EQ(Call->getNextNode(), Ret);
+ EXPECT_EQ(Call->getCalledFunction(), &F);
+ EXPECT_EQ(range_size(Call->args()), 1u);
+ EXPECT_EQ(Call->getArgOperand(0), Arg0);
+ }
+ {
+ // Check create() InsertAtEnd.
+ auto *Call = cast<sandboxir::CallInst>(
+ sandboxir::CallInst::create(FTy, &F, Args, /*InsertAtEnd=*/BB, Ctx));
+ EXPECT_EQ(Call->getPrevNode(), Ret);
+ EXPECT_EQ(Call->getCalledFunction(), &F);
+ EXPECT_EQ(range_size(Call->args()), 1u);
+ EXPECT_EQ(Call->getArgOperand(0), Arg0);
+ }
+}
+
+TEST_F(SandboxIRTest, InvokeInst) {
+ parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+ invoke i8 @foo(i8 %arg) to label %normal_bb
+ unwind label %exception_bb
+ normal_bb:
+ ret void
+ exception_bb:
+ %lpad = landingpad { ptr, i32}
+ cleanup
+ ret void
+ other_bb:
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ auto &F = *Ctx.createFunction(&LLVMF);
+ auto *Arg = F.getArg(0);
+ auto *BB0 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+ auto *NormalBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "normal_bb")));
+ auto *ExceptionBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "exception_bb")));
+ auto *LandingPad = &*ExceptionBB->begin();
+ auto *OtherBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+ auto It = BB0->begin();
+ // Check classof(Instruction *).
+ auto *Invoke = cast<sandboxir::InvokeInst>(&*It++);
+
+ // Check getNormalDest().
+ EXPECT_EQ(Invoke->getNormalDest(), NormalBB);
+ // Check getUnwindDest().
+ EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+ // Check getSuccessor().
+ EXPECT_EQ(Invoke->getSuccessor(0), NormalBB);
+ EXPECT_EQ(Invoke->getSuccessor(1), ExceptionBB);
+ // Check setNormalDest().
+ Invoke->setNormalDest(OtherBB);
+ EXPECT_EQ(Invoke->getNormalDest(), OtherBB);
+ EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+ // Check setUnwindDest().
+ Invoke->setUnwindDest(OtherBB);
+ EXPECT_EQ(Invoke->getNormalDest(), OtherBB);
+ EXPECT_EQ(Invoke->getUnwindDest(), OtherBB);
+ // Check setSuccessor().
+ Invoke->setSuccessor(0, NormalBB);
+ EXPECT_EQ(Invoke->getNormalDest(), NormalBB);
+ Invoke->setSuccessor(1, ExceptionBB);
+ EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+ // Check getLandingPadInst().
+ EXPECT_EQ(Invoke->getLandingPadInst(), LandingPad);
+
+ {
+ // Check create() WhereIt, WhereBB.
+ SmallVector<sandboxir::Value *> Args({Arg});
+ auto *InsertBefore = &*BB0->begin();
+ auto *NewInvoke = cast<sandboxir::InvokeInst>(sandboxir::InvokeInst::create(
+ F.getFunctionType(), &F, NormalBB, ExceptionBB, Args,
+ /*WhereIt=*/InsertBefore->getIterator(), /*WhereBB=*/BB0, Ctx));
+ EXPECT_EQ(NewInvoke->getNormalDest(), NormalBB);
+ EXPECT_EQ(NewInvoke->getUnwindDest(), ExceptionBB);
+ EXPECT_EQ(NewInvoke->getNextNode(), InsertBefore);
+ }
+ {
+ // Check create() InsertBefore.
+ SmallVector<sandboxir::Value *> Args({Arg});
+ auto *InsertBefore = &*BB0->begin();
+ auto *NewInvoke = cast<sandboxir::InvokeInst>(
+ sandboxir::InvokeInst::create(F.getFunctionType(), &F, NormalBB,
+ ExceptionBB, Args, InsertBefore, Ctx));
+ EXPECT_EQ(NewInvoke->getNormalDest(), NormalBB);
+ EXPECT_EQ(NewInvoke->getUnwindDest(), ExceptionBB);
+ EXPECT_EQ(NewInvoke->getNextNode(), InsertBefore);
+ }
+ {
+ // Check create() InsertAtEnd.
+ SmallVector<sandboxir::Value *> Args({Arg});
+ auto *NewInvoke = cast<sandboxir::InvokeInst>(sandboxir::InvokeInst::create(
+ F.getFunctionType(), &F, NormalBB, ExceptionBB, Args,
+ /*InsertAtEnd=*/BB0, Ctx));
+ EXPECT_EQ(NewInvoke->getNormalDest(), NormalBB);
+ EXPECT_EQ(NewInvoke->getUnwindDest(), ExceptionBB);
+ EXPECT_EQ(NewInvoke->getParent(), BB0);
+ EXPECT_EQ(NewInvoke->getNextNode(), nullptr);
+ }
+}
+
+TEST_F(SandboxIRTest, CallBrInst) {
+ parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+ callbr void asm "", ""()
+ to label %bb1 [label %bb2]
+ bb1:
+ ret void
+ bb2:
+ ret void
+ other_bb:
+ ret void
+ bb3:
+ callbr void @foo(i8 %arg)
+ to label %bb1 [label %bb2]
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ auto *LLVMBB0 = getBasicBlockByName(LLVMF, "bb0");
+ auto *LLVMCallBr = cast<llvm::CallBrInst>(&*LLVMBB0->begin());
+ sandboxir::Context Ctx(C);
+ auto &F = *Ctx.createFunction(&LLVMF);
+ auto *Arg = F.getArg(0);
+ auto *BB0 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+ auto *BB1 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb1")));
+ auto *BB2 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb2")));
+ auto *BB3 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb3")));
+ auto *OtherBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+ auto It = BB0->begin();
+ // Check classof(Instruction *).
+ auto *CallBr0 = cast<sandboxir::CallBrInst>(&*It++);
+
+ It = BB3->begin();
+ auto *CallBr1 = cast<sandboxir::CallBrInst>(&*It++);
+ for (sandboxir::CallBrInst *CallBr : {CallBr0, CallBr1}) {
+ // Check getNumIndirectDests().
+ EXPECT_EQ(CallBr->getNumIndirectDests(), 1u);
+ // Check getIndirectDestLabel().
+ EXPECT_EQ(CallBr->getIndirectDestLabel(0),
+ Ctx.getValue(LLVMCallBr->getIndirectDestLabel(0)));
+ // Check getIndirectDestLabelUse().
+ EXPECT_EQ(CallBr->getIndirectDestLabelUse(0),
+ Ctx.getValue(LLVMCallBr->getIndirectDestLabelUse(0)));
+ // Check getDefaultDest().
+ EXPECT_EQ(CallBr->getDefaultDest(),
+ Ctx.getValue(LLVMCallBr->getDefaultDest()));
+ // Check getIndirectDest().
+ EXPECT_EQ(CallBr->getIndirectDest(0),
+ Ctx.getValue(LLVMCallBr->getIndirectDest(0)));
+ // Check getIndirectDests().
+ auto Dests = CallBr->getIndirectDests();
+ EXPECT_EQ(Dests.size(), LLVMCallBr->getIndirectDests().size());
+ EXPECT_EQ(Dests[0], Ctx.getValue(LLVMCallBr->getIndirectDests()[0]));
+ // Check getNumSuccessors().
+ EXPECT_EQ(CallBr->getNumSuccessors(), LLVMCallBr->getNumSuccessors());
+ // Check getSuccessor().
+ for (unsigned SuccIdx = 0, E = CallBr->getNumSuccessors(); SuccIdx != E;
+ ++SuccIdx)
+ EXPECT_EQ(CallBr->getSuccessor(SuccIdx),
+ Ctx.getValue(LLVMCallBr->getSuccessor(SuccIdx)));
+ // Check setDefaultDest().
+ auto *SvDefaultDest = CallBr->getDefaultDest();
+ CallBr->setDefaultDest(OtherBB);
+ EXPECT_EQ(CallBr->getDefaultDest(), OtherBB);
+ CallBr->setDefaultDest(SvDefaultDest);
+ // Check setIndirectDest().
+ auto *SvIndirectDest = CallBr->getIndirectDest(0);
+ CallBr->setIndirectDest(0, OtherBB);
+ EXPECT_EQ(CallBr->getIndirectDest(0), OtherBB);
+ CallBr->setIndirectDest(0, SvIndirectDest);
+ }
+
+ {
+ // Check create() WhereIt, WhereBB.
+ SmallVector<sandboxir::Value *> Args({Arg});
+ auto *NewCallBr = cast<sandboxir::CallBrInst>(sandboxir::CallBrInst::create(
+ F.getFunctionType(), &F, BB1, {BB2}, Args, /*WhereIt=*/BB0->end(),
+ /*WhereBB=*/BB0, Ctx));
+ EXPECT_EQ(NewCallBr->getDefaultDest(), BB1);
+ EXPECT_EQ(NewCallBr->getIndirectDests().size(), 1u);
+ EXPECT_EQ(NewCallBr->getIndirectDests()[0], BB2);
+ EXPECT_EQ(NewCallBr->getNextNode(), nullptr);
+ EXPECT_EQ(NewCallBr->getParent(), BB0);
+ }
+ {
+ // Check create() InsertBefore
+ SmallVector<sandboxir::Value *> Args({Arg});
+ auto *InsertBefore = &*BB0->rbegin();
+ auto *NewCallBr = cast<sandboxir::CallBrInst>(sandboxir::CallBrInst::create(
+ F.getFunctionType(), &F, BB1, {BB2}, Args, InsertBefore, Ctx));
+ EXPECT_EQ(NewCallBr->getDefaultDest(), BB1);
+ EXPECT_EQ(NewCallBr->getIndirectDests().size(), 1u);
+ EXPECT_EQ(NewCallBr->getIndirectDests()[0], BB2);
+ EXPECT_EQ(NewCallBr->getNextNode(), InsertBefore);
+ }
+ {
+ // Check create() InsertAtEnd.
+ SmallVector<sandboxir::Value *> Args({Arg});
+ auto *NewCallBr = cast<sandboxir::CallBrInst>(
+ sandboxir::CallBrInst::create(F.getFunctionType(), &F, BB1, {BB2}, Args,
+ /*InsertAtEnd=*/BB0, Ctx));
+ EXPECT_EQ(NewCallBr->getDefaultDest(), BB1);
+ EXPECT_EQ(NewCallBr->getIndirectDests().size(), 1u);
+ EXPECT_EQ(NewCallBr->getIndirectDests()[0], BB2);
+ EXPECT_EQ(NewCallBr->getNextNode(), nullptr);
+ EXPECT_EQ(NewCallBr->getParent(), BB0);
+ }
+}
+
+TEST_F(SandboxIRTest, GetElementPtrInstruction) {
+ parseIR(C, R"IR(
+define void @foo(ptr %ptr, <2 x ptr> %ptrs) {
+ %gep0 = getelementptr i8, ptr %ptr, i32 0
+ %gep1 = getelementptr nusw i8, ptr %ptr, i32 0
+ %gep2 = getelementptr nuw i8, ptr %ptr, i32 0
+ %gep3 = getelementptr inbounds {i32, {i32, i8}}, ptr %ptr, i32 1, i32 0
+ %gep4 = getelementptr inbounds {i8, i8, {i32, i16}}, <2 x ptr> %ptrs, i32 2, <2 x i32> <i32 0, i32 0>
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ BasicBlock *LLVMBB = &*LLVMF.begin();
+ auto LLVMIt = LLVMBB->begin();
+ SmallVector<llvm::GetElementPtrInst *, 4> LLVMGEPs;
+ while (isa<llvm::GetElementPtrInst>(&*LLVMIt))
+ LLVMGEPs.push_back(cast<llvm::GetElementPtrInst>(&*LLVMIt++));
+ auto *LLVMRet = cast<llvm::ReturnInst>(&*LLVMIt++);
+ sandboxir::Context Ctx(C);
+  auto &F = *Ctx.createFunction(&LLVMF);
+
+ for (llvm::GetElementPtrInst *LLVMGEP : LLVMGEPs) {
+ // Check classof().
+ auto *GEP = cast<sandboxir::GetElementPtrInst>(Ctx.getValue(LLVMGEP));
+ // Check getSourceElementType().
+ EXPECT_EQ(GEP->getSourceElementType(), LLVMGEP->getSourceElementType());
+ // Check getResultElementType().
+ EXPECT_EQ(GEP->getResultElementType(), LLVMGEP->getResultElementType());
+ // Check getAddressSpace().
+ EXPECT_EQ(GEP->getAddressSpace(), LLVMGEP->getAddressSpace());
+ // Check indices().
+ EXPECT_EQ(range_size(GEP->indices()), range_size(LLVMGEP->indices()));
+ auto IdxIt = GEP->idx_begin();
+ for (llvm::Value *LLVMIdxV : LLVMGEP->indices()) {
+ sandboxir::Value *IdxV = *IdxIt++;
+ EXPECT_EQ(IdxV, Ctx.getValue(LLVMIdxV));
+ }
+ // Check getPointerOperand().
+ EXPECT_EQ(GEP->getPointerOperand(),
+ Ctx.getValue(LLVMGEP->getPointerOperand()));
+ // Check getPointerOperandIndex().
+ EXPECT_EQ(GEP->getPointerOperandIndex(), LLVMGEP->getPointerOperandIndex());
+ // Check getPointerOperandType().
+ EXPECT_EQ(GEP->getPointerOperandType(), LLVMGEP->getPointerOperandType());
+ // Check getPointerAddressSpace().
+ EXPECT_EQ(GEP->getPointerAddressSpace(), LLVMGEP->getPointerAddressSpace());
+ // Check getNumIndices().
+ EXPECT_EQ(GEP->getNumIndices(), LLVMGEP->getNumIndices());
+ // Check hasIndices().
+ EXPECT_EQ(GEP->hasIndices(), LLVMGEP->hasIndices());
+ // Check hasAllConstantIndices().
+ EXPECT_EQ(GEP->hasAllConstantIndices(), LLVMGEP->hasAllConstantIndices());
+ // Check getNoWrapFlags().
+ EXPECT_EQ(GEP->getNoWrapFlags(), LLVMGEP->getNoWrapFlags());
+ // Check isInBounds().
+ EXPECT_EQ(GEP->isInBounds(), LLVMGEP->isInBounds());
+ // Check hasNoUnsignedWrap().
+ EXPECT_EQ(GEP->hasNoUnsignedWrap(), LLVMGEP->hasNoUnsignedWrap());
+ // Check accumulateConstantOffset().
+ DataLayout DL(M.get());
+ APInt Offset1 =
+ APInt::getZero(DL.getIndexSizeInBits(GEP->getPointerAddressSpace()));
+ APInt Offset2 =
+ APInt::getZero(DL.getIndexSizeInBits(GEP->getPointerAddressSpace()));
+ EXPECT_EQ(GEP->accumulateConstantOffset(DL, Offset1),
+ LLVMGEP->accumulateConstantOffset(DL, Offset2));
+ EXPECT_EQ(Offset1, Offset2);
+ }
+
+ auto *BB = &*F.begin();
+ auto *GEP0 = cast<sandboxir::GetElementPtrInst>(&*BB->begin());
+ auto *Ret = cast<sandboxir::ReturnInst>(Ctx.getValue(LLVMRet));
+ SmallVector<sandboxir::Value *> Indices(GEP0->indices());
+
+ // Check create() WhereIt, WhereBB.
+ auto *NewGEP0 =
+ cast<sandboxir::GetElementPtrInst>(sandboxir::GetElementPtrInst::create(
+ GEP0->getType(), GEP0->getPointerOperand(), Indices,
+ /*WhereIt=*/Ret->getIterator(), /*WhereBB=*/Ret->getParent(), Ctx,
+ "NewGEP0"));
+ EXPECT_EQ(NewGEP0->getName(), "NewGEP0");
+ EXPECT_EQ(NewGEP0->getType(), GEP0->getType());
+ EXPECT_EQ(NewGEP0->getPointerOperand(), GEP0->getPointerOperand());
+ EXPECT_EQ(range_size(NewGEP0->indices()), range_size(GEP0->indices()));
+ for (auto NewIt = NewGEP0->idx_begin(), NewItE = NewGEP0->idx_end(),
+ OldIt = GEP0->idx_begin();
+ NewIt != NewItE; ++NewIt) {
+ sandboxir::Value *NewIdxV = *NewIt;
+ sandboxir::Value *OldIdxV = *OldIt;
+ EXPECT_EQ(NewIdxV, OldIdxV);
+ }
+ EXPECT_EQ(NewGEP0->getNextNode(), Ret);
+
+ // Check create() InsertBefore.
+ auto *NewGEP1 =
+ cast<sandboxir::GetElementPtrInst>(sandboxir::GetElementPtrInst::create(
+ GEP0->getType(), GEP0->getPointerOperand(), Indices,
+ /*InsertBefore=*/Ret, Ctx, "NewGEP1"));
+ EXPECT_EQ(NewGEP1->getName(), "NewGEP1");
+ EXPECT_EQ(NewGEP1->getType(), GEP0->getType());
+ EXPECT_EQ(NewGEP1->getPointerOperand(), GEP0->getPointerOperand());
+ EXPECT_EQ(range_size(NewGEP1->indices()), range_size(GEP0->indices()));
+ for (auto NewIt = NewGEP0->idx_begin(), NewItE = NewGEP0->idx_end(),
+ OldIt = GEP0->idx_begin();
+ NewIt != NewItE; ++NewIt) {
+ sandboxir::Value *NewIdxV = *NewIt;
+ sandboxir::Value *OldIdxV = *OldIt;
+ EXPECT_EQ(NewIdxV, OldIdxV);
+ }
+ EXPECT_EQ(NewGEP1->getNextNode(), Ret);
+
+ // Check create() InsertAtEnd.
+ auto *NewGEP2 =
+ cast<sandboxir::GetElementPtrInst>(sandboxir::GetElementPtrInst::create(
+ GEP0->getType(), GEP0->getPointerOperand(), Indices,
+ /*InsertAtEnd=*/BB, Ctx, "NewGEP2"));
+ EXPECT_EQ(NewGEP2->getName(), "NewGEP2");
+ EXPECT_EQ(NewGEP2->getType(), GEP0->getType());
+ EXPECT_EQ(NewGEP2->getPointerOperand(), GEP0->getPointerOperand());
+ EXPECT_EQ(range_size(NewGEP2->indices()), range_size(GEP0->indices()));
+ for (auto NewIt = NewGEP0->idx_begin(), NewItE = NewGEP0->idx_end(),
+ OldIt = GEP0->idx_begin();
+ NewIt != NewItE; ++NewIt) {
+ sandboxir::Value *NewIdxV = *NewIt;
+ sandboxir::Value *OldIdxV = *OldIt;
+ EXPECT_EQ(NewIdxV, OldIdxV);
+ }
+ EXPECT_EQ(NewGEP2->getPrevNode(), Ret);
+ EXPECT_EQ(NewGEP2->getNextNode(), nullptr);
+}
+
+TEST_F(SandboxIRTest, CastInst) {
+ parseIR(C, R"IR(
+define void @foo(i32 %arg, float %farg, double %darg, ptr %ptr) {
+ %zext = zext i32 %arg to i64
+ %sext = sext i32 %arg to i64
+ %fptoui = fptoui float %farg to i32
+ %fptosi = fptosi float %farg to i32
+ %fpext = fpext float %farg to double
+ %ptrtoint = ptrtoint ptr %ptr to i32
+ %inttoptr = inttoptr i32 %arg to ptr
+ %sitofp = sitofp i32 %arg to float
+ %uitofp = uitofp i32 %arg to float
+ %trunc = trunc i32 %arg to i16
+ %fptrunc = fptrunc double %darg to float
+ %bitcast = bitcast i32 %arg to float
+ %addrspacecast = addrspacecast ptr %ptr to ptr addrspace(1)
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+ unsigned ArgIdx = 0;
+ auto *Arg = F->getArg(ArgIdx++);
+ auto *BB = &*F->begin();
+ auto It = BB->begin();
+
+ Type *Ti64 = Type::getInt64Ty(C);
+ Type *Ti32 = Type::getInt32Ty(C);
+ Type *Ti16 = Type::getInt16Ty(C);
+ Type *Tdouble = Type::getDoubleTy(C);
+ Type *Tfloat = Type::getFloatTy(C);
+ Type *Tptr = Tfloat->getPointerTo();
+ Type *Tptr1 = Tfloat->getPointerTo(1);
+
+ // Check classof(), getOpcode(), getSrcTy(), getDstTy()
+ auto *ZExt = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_EQ(ZExt->getOpcode(), sandboxir::Instruction::Opcode::ZExt);
+ EXPECT_EQ(ZExt->getSrcTy(), Ti32);
+ EXPECT_EQ(ZExt->getDestTy(), Ti64);
+
+ auto *SExt = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_EQ(SExt->getOpcode(), sandboxir::Instruction::Opcode::SExt);
+ EXPECT_EQ(SExt->getSrcTy(), Ti32);
+ EXPECT_EQ(SExt->getDestTy(), Ti64);
+
+ auto *FPToUI = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_TRUE(isa<sandboxir::FPToUIInst>(FPToUI));
+ EXPECT_EQ(FPToUI->getOpcode(), sandboxir::Instruction::Opcode::FPToUI);
+ EXPECT_EQ(FPToUI->getSrcTy(), Tfloat);
+ EXPECT_EQ(FPToUI->getDestTy(), Ti32);
+
+ auto *FPToSI = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_TRUE(isa<sandboxir::FPToSIInst>(FPToSI));
+ EXPECT_EQ(FPToSI->getOpcode(), sandboxir::Instruction::Opcode::FPToSI);
+ EXPECT_EQ(FPToSI->getSrcTy(), Tfloat);
+ EXPECT_EQ(FPToSI->getDestTy(), Ti32);
+
+ auto *FPExt = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_EQ(FPExt->getOpcode(), sandboxir::Instruction::Opcode::FPExt);
+ EXPECT_EQ(FPExt->getSrcTy(), Tfloat);
+ EXPECT_EQ(FPExt->getDestTy(), Tdouble);
+
+ auto *PtrToInt = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_TRUE(isa<sandboxir::PtrToIntInst>(PtrToInt));
+ EXPECT_EQ(PtrToInt->getOpcode(), sandboxir::Instruction::Opcode::PtrToInt);
+ EXPECT_EQ(PtrToInt->getSrcTy(), Tptr);
+ EXPECT_EQ(PtrToInt->getDestTy(), Ti32);
+
+ auto *IntToPtr = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_TRUE(isa<sandboxir::IntToPtrInst>(IntToPtr));
+ EXPECT_EQ(IntToPtr->getOpcode(), sandboxir::Instruction::Opcode::IntToPtr);
+ EXPECT_EQ(IntToPtr->getSrcTy(), Ti32);
+ EXPECT_EQ(IntToPtr->getDestTy(), Tptr);
+
+ auto *SIToFP = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_TRUE(isa<sandboxir::SIToFPInst>(SIToFP));
+ EXPECT_EQ(SIToFP->getOpcode(), sandboxir::Instruction::Opcode::SIToFP);
+ EXPECT_EQ(SIToFP->getSrcTy(), Ti32);
+ EXPECT_EQ(SIToFP->getDestTy(), Tfloat);
+
+ auto *UIToFP = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_EQ(UIToFP->getOpcode(), sandboxir::Instruction::Opcode::UIToFP);
+ EXPECT_EQ(UIToFP->getSrcTy(), Ti32);
+ EXPECT_EQ(UIToFP->getDestTy(), Tfloat);
+
+ auto *Trunc = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_EQ(Trunc->getOpcode(), sandboxir::Instruction::Opcode::Trunc);
+ EXPECT_EQ(Trunc->getSrcTy(), Ti32);
+ EXPECT_EQ(Trunc->getDestTy(), Ti16);
+
+ auto *FPTrunc = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_EQ(FPTrunc->getOpcode(), sandboxir::Instruction::Opcode::FPTrunc);
+ EXPECT_EQ(FPTrunc->getSrcTy(), Tdouble);
+ EXPECT_EQ(FPTrunc->getDestTy(), Tfloat);
+
+ auto *BitCast = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_TRUE(isa<sandboxir::BitCastInst>(BitCast));
+ EXPECT_EQ(BitCast->getOpcode(), sandboxir::Instruction::Opcode::BitCast);
+ EXPECT_EQ(BitCast->getSrcTy(), Ti32);
+ EXPECT_EQ(BitCast->getDestTy(), Tfloat);
+
+ auto *AddrSpaceCast = cast<sandboxir::CastInst>(&*It++);
+ EXPECT_TRUE(isa<sandboxir::AddrSpaceCastInst>(AddrSpaceCast));
+ EXPECT_EQ(AddrSpaceCast->getOpcode(),
+ sandboxir::Instruction::Opcode::AddrSpaceCast);
+ EXPECT_EQ(AddrSpaceCast->getSrcTy(), Tptr);
+ EXPECT_EQ(AddrSpaceCast->getDestTy(), Tptr1);
+
+ auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+ {
+ // Check create() WhereIt, WhereBB
+ auto *NewI = cast<sandboxir::CastInst>(sandboxir::CastInst::create(
+ Ti64, sandboxir::Instruction::Opcode::SExt, Arg, /*WhereIt=*/BB->end(),
+ /*WhereBB=*/BB, Ctx, "SExt"));
+ // Check getOpcode().
+ EXPECT_EQ(NewI->getOpcode(), sandboxir::Instruction::Opcode::SExt);
+ // Check getSrcTy().
+ EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+ // Check getDestTy().
+ EXPECT_EQ(NewI->getDestTy(), Ti64);
+ // Check instr position.
+ EXPECT_EQ(NewI->getNextNode(), nullptr);
+ EXPECT_EQ(NewI->getPrevNode(), Ret);
+ }
+
+ {
+ // Check create() InsertBefore.
+ auto *NewI = cast<sandboxir::CastInst>(
+ sandboxir::CastInst::create(Ti64, sandboxir::Instruction::Opcode::ZExt,
+ Arg, /*InsertBefore=*/Ret, Ctx, "ZExt"));
+ // Check getOpcode().
+ EXPECT_EQ(NewI->getOpcode(), sandboxir::Instruction::Opcode::ZExt);
+ // Check getSrcTy().
+ EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+ // Check getDestTy().
+ EXPECT_EQ(NewI->getDestTy(), Ti64);
+ // Check instr position.
+ EXPECT_EQ(NewI->getNextNode(), Ret);
+ }
+ {
+ // Check create() InsertAtEnd.
+ auto *NewI = cast<sandboxir::CastInst>(
+ sandboxir::CastInst::create(Ti64, sandboxir::Instruction::Opcode::ZExt,
+ Arg, /*InsertAtEnd=*/BB, Ctx, "ZExt"));
+ // Check getOpcode().
+ EXPECT_EQ(NewI->getOpcode(), sandboxir::Instruction::Opcode::ZExt);
+ // Check getSrcTy().
+ EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+ // Check getDestTy().
+ EXPECT_EQ(NewI->getDestTy(), Ti64);
+ // Check instr position.
+ EXPECT_EQ(NewI->getNextNode(), nullptr);
+ EXPECT_EQ(NewI->getParent(), BB);
+ }
+
+ {
+#ifndef NDEBUG
+ // Check that passing a non-cast opcode crashes.
+ EXPECT_DEATH(
+ sandboxir::CastInst::create(Ti64, sandboxir::Instruction::Opcode::Store,
+ Arg, /*InsertBefore=*/Ret, Ctx, "Bad"),
+ ".*Opcode.*");
+#endif // NDEBUG
+ }
+}
+
+/// CastInst's subclasses are very similar, so we can use a common test
+/// function for them.
+template <typename SubclassT, sandboxir::Instruction::Opcode OpcodeT>
+void testCastInst(llvm::Module &M, Type *SrcTy, Type *DstTy) {
+ Function &LLVMF = *M.getFunction("foo");
+ sandboxir::Context Ctx(M.getContext());
+ sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+ unsigned ArgIdx = 0;
+ auto *Arg = F->getArg(ArgIdx++);
+ auto *BB = &*F->begin();
+ auto It = BB->begin();
+
+ auto *CI = cast<SubclassT>(&*It++);
+ EXPECT_EQ(CI->getOpcode(), OpcodeT);
+ EXPECT_EQ(CI->getSrcTy(), SrcTy);
+ EXPECT_EQ(CI->getDestTy(), DstTy);
+ auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+ {
+ // Check create() WhereIt, WhereBB
+ auto *NewI =
+ cast<SubclassT>(SubclassT::create(Arg, DstTy, /*WhereIt=*/BB->end(),
+ /*WhereBB=*/BB, Ctx, "NewCI"));
+ // Check getOpcode().
+ EXPECT_EQ(NewI->getOpcode(), OpcodeT);
+ // Check getSrcTy().
+ EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+ // Check getDestTy().
+ EXPECT_EQ(NewI->getDestTy(), DstTy);
+ // Check instr position.
+ EXPECT_EQ(NewI->getNextNode(), nullptr);
+ EXPECT_EQ(NewI->getPrevNode(), Ret);
+ // Check instr name.
+ EXPECT_EQ(NewI->getName(), "NewCI");
+ }
+ {
+ // Check create() InsertBefore.
+ auto *NewI =
+ cast<SubclassT>(SubclassT::create(Arg, DstTy,
+ /*InsertBefore=*/Ret, Ctx, "NewCI"));
+ // Check getOpcode().
+ EXPECT_EQ(NewI->getOpcode(), OpcodeT);
+ // Check getSrcTy().
+ EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+ // Check getDestTy().
+ EXPECT_EQ(NewI->getDestTy(), DstTy);
+ // Check instr position.
+ EXPECT_EQ(NewI->getNextNode(), Ret);
+ }
+ {
+ // Check create() InsertAtEnd.
+ auto *NewI =
+ cast<SubclassT>(SubclassT::create(Arg, DstTy,
+ /*InsertAtEnd=*/BB, Ctx, "NewCI"));
+ // Check getOpcode().
+ EXPECT_EQ(NewI->getOpcode(), OpcodeT);
+ // Check getSrcTy().
+ EXPECT_EQ(NewI->getSrcTy(), Arg->getType());
+ // Check getDestTy().
+ EXPECT_EQ(NewI->getDestTy(), DstTy);
+ // Check instr position.
+ EXPECT_EQ(NewI->getNextNode(), nullptr);
+ EXPECT_EQ(NewI->getParent(), BB);
+ }
+}
+
+TEST_F(SandboxIRTest, SIToFPInst) {
+ parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+ %sitofp = sitofp i32 %arg to float
+ ret void
+}
+)IR");
+ testCastInst<sandboxir::SIToFPInst, sandboxir::Instruction::Opcode::SIToFP>(
+ *M,
+ /*SrcTy=*/Type::getInt32Ty(C),
+ /*DstTy=*/Type::getFloatTy(C));
+}
+
+TEST_F(SandboxIRTest, FPToUIInst) {
+ parseIR(C, R"IR(
+define void @foo(float %arg) {
+ %fptoui = fptoui float %arg to i32
+ ret void
+}
+)IR");
+  testCastInst<sandboxir::FPToUIInst, sandboxir::Instruction::Opcode::FPToUI>(
+      *M, /*SrcTy=*/Type::getFloatTy(C), /*DstTy=*/Type::getInt32Ty(C));
+}
+
+TEST_F(SandboxIRTest, FPToSIInst) {
+ parseIR(C, R"IR(
+define void @foo(float %arg) {
+ %fptosi = fptosi float %arg to i32
+ ret void
+}
+)IR");
+ testCastInst<sandboxir::FPToSIInst, sandboxir::Instruction::Opcode::FPToSI>(
+ *M, /*SrcTy=*/Type::getFloatTy(C), /*DstTy=*/Type::getInt32Ty(C));
+}
+
+TEST_F(SandboxIRTest, IntToPtrInst) {
+ parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+ %inttoptr = inttoptr i32 %arg to ptr
+ ret void
+}
+)IR");
+ testCastInst<sandboxir::IntToPtrInst,
+ sandboxir::Instruction::Opcode::IntToPtr>(
+ *M,
+ /*SrcTy=*/Type::getInt32Ty(C), /*DstTy=*/PointerType::get(C, 0));
+}
+
+TEST_F(SandboxIRTest, PtrToIntInst) {
+ parseIR(C, R"IR(
+define void @foo(ptr %ptr) {
+ %ptrtoint = ptrtoint ptr %ptr to i32
+ ret void
+}
+)IR");
+ testCastInst<sandboxir::PtrToIntInst,
+ sandboxir::Instruction::Opcode::PtrToInt>(
+ *M, /*SrcTy=*/PointerType::get(C, 0), /*DstTy=*/Type::getInt32Ty(C));
+}
+
+TEST_F(SandboxIRTest, BitCastInst) {
+ parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+ %bitcast = bitcast i32 %arg to float
+ ret void
+}
+)IR");
+ testCastInst<sandboxir::BitCastInst, sandboxir::Instruction::Opcode::BitCast>(
+ *M,
+ /*SrcTy=*/Type::getInt32Ty(C), /*DstTy=*/Type::getFloatTy(C));
+}
+
+TEST_F(SandboxIRTest, AddrSpaceCastInst) {
+ parseIR(C, R"IR(
+define void @foo(ptr %ptr) {
+ %addrspacecast = addrspacecast ptr %ptr to ptr addrspace(1)
+ ret void
+}
+)IR");
+ Type *Tptr0 = PointerType::get(C, 0);
+ Type *Tptr1 = PointerType::get(C, 1);
+ testCastInst<sandboxir::AddrSpaceCastInst,
+ sandboxir::Instruction::Opcode::AddrSpaceCast>(*M,
+ /*SrcTy=*/Tptr0,
+ /*DstTy=*/Tptr1);
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+ unsigned ArgIdx = 0;
+ auto *Arg = F->getArg(ArgIdx++);
+ auto *BB = &*F->begin();
+ auto It = BB->begin();
+
+ auto *AddrSpaceCast = cast<sandboxir::AddrSpaceCastInst>(&*It++);
+ EXPECT_EQ(AddrSpaceCast->getOpcode(),
+ sandboxir::Instruction::Opcode::AddrSpaceCast);
+ EXPECT_EQ(AddrSpaceCast->getPointerOperand(), Arg);
+ EXPECT_EQ(sandboxir::AddrSpaceCastInst::getPointerOperandIndex(), 0u);
+ EXPECT_EQ(AddrSpaceCast->getSrcAddressSpace(),
+ cast<PointerType>(Tptr0)->getPointerAddressSpace());
+ EXPECT_EQ(AddrSpaceCast->getDestAddressSpace(),
+ cast<PointerType>(Tptr1)->getPointerAddressSpace());
+}
+
+TEST_F(SandboxIRTest, PHINode) {
+ parseIR(C, R"IR(
+define void @foo(i32 %arg) {
+bb1:
+ br label %bb2
+
+bb2:
+ %phi = phi i32 [ %arg, %bb1 ], [ 0, %bb2 ]
+ br label %bb2
+
+bb3:
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ auto *LLVMBB1 = getBasicBlockByName(LLVMF, "bb1");
+ auto *LLVMBB2 = getBasicBlockByName(LLVMF, "bb2");
+ auto *LLVMBB3 = getBasicBlockByName(LLVMF, "bb3");
+ auto LLVMIt = LLVMBB2->begin();
+ auto *LLVMPHI = cast<llvm::PHINode>(&*LLVMIt++);
+ sandboxir::Context Ctx(C);
+ sandboxir::Function *F = Ctx.createFunction(&LLVMF);
+ auto *Arg = F->getArg(0);
+ auto *BB1 = cast<sandboxir::BasicBlock>(Ctx.getValue(LLVMBB1));
+ auto *BB2 = cast<sandboxir::BasicBlock>(Ctx.getValue(LLVMBB2));
+ auto *BB3 = cast<sandboxir::BasicBlock>(Ctx.getValue(LLVMBB3));
+ auto It = BB2->begin();
+ // Check classof().
+ auto *PHI = cast<sandboxir::PHINode>(&*It++);
+ auto *Br = cast<sandboxir::BranchInst>(&*It++);
+ // Check blocks().
+ EXPECT_EQ(range_size(PHI->blocks()), range_size(LLVMPHI->blocks()));
+ auto BlockIt = PHI->block_begin();
+ for (llvm::BasicBlock *LLVMBB : LLVMPHI->blocks()) {
+ sandboxir::BasicBlock *BB = *BlockIt++;
+ EXPECT_EQ(BB, Ctx.getValue(LLVMBB));
+ }
+ // Check incoming_values().
+ EXPECT_EQ(range_size(PHI->incoming_values()),
+ range_size(LLVMPHI->incoming_values()));
+ auto IncIt = PHI->incoming_values().begin();
+ for (llvm::Value *LLVMV : LLVMPHI->incoming_values()) {
+ sandboxir::Value *IncV = *IncIt++;
+ EXPECT_EQ(IncV, Ctx.getValue(LLVMV));
+ }
+ // Check getNumIncomingValues().
+ EXPECT_EQ(PHI->getNumIncomingValues(), LLVMPHI->getNumIncomingValues());
+ // Check getIncomingValue().
+ EXPECT_EQ(PHI->getIncomingValue(0),
+ Ctx.getValue(LLVMPHI->getIncomingValue(0)));
+ EXPECT_EQ(PHI->getIncomingValue(1),
+ Ctx.getValue(LLVMPHI->getIncomingValue(1)));
+ // Check setIncomingValue().
+ auto *OrigV = PHI->getIncomingValue(0);
+ PHI->setIncomingValue(0, PHI);
+ EXPECT_EQ(PHI->getIncomingValue(0), PHI);
+ PHI->setIncomingValue(0, OrigV);
+ // Check getOperandNumForIncomingValue().
+ EXPECT_EQ(sandboxir::PHINode::getOperandNumForIncomingValue(0),
+ llvm::PHINode::getOperandNumForIncomingValue(0));
+ // Check getIncomingValueNumForOperand().
+ EXPECT_EQ(sandboxir::PHINode::getIncomingValueNumForOperand(0),
+ llvm::PHINode::getIncomingValueNumForOperand(0));
+ // Check getIncomingBlock(unsigned).
+ EXPECT_EQ(PHI->getIncomingBlock(0),
+ Ctx.getValue(LLVMPHI->getIncomingBlock(0)));
+ // Check getIncomingBlock(Use).
+ llvm::Use &LLVMUse = LLVMPHI->getOperandUse(0);
+ sandboxir::Use Use = PHI->getOperandUse(0);
+ EXPECT_EQ(PHI->getIncomingBlock(Use),
+ Ctx.getValue(LLVMPHI->getIncomingBlock(LLVMUse)));
+ // Check setIncomingBlock().
+ sandboxir::BasicBlock *OrigBB = PHI->getIncomingBlock(0);
+ EXPECT_NE(OrigBB, BB2);
+ PHI->setIncomingBlock(0, BB2);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB2);
+ PHI->setIncomingBlock(0, OrigBB);
+ EXPECT_EQ(PHI->getIncomingBlock(0), OrigBB);
+ // Check addIncoming().
+ unsigned OrigNumIncoming = PHI->getNumIncomingValues();
+ PHI->addIncoming(Arg, BB3);
+ EXPECT_EQ(PHI->getNumIncomingValues(), LLVMPHI->getNumIncomingValues());
+ EXPECT_EQ(PHI->getNumIncomingValues(), OrigNumIncoming + 1);
+ EXPECT_EQ(PHI->getIncomingValue(OrigNumIncoming), Arg);
+ EXPECT_EQ(PHI->getIncomingBlock(OrigNumIncoming), BB3);
+ // Check removeIncomingValue(unsigned).
+ PHI->removeIncomingValue(OrigNumIncoming);
+ EXPECT_EQ(PHI->getNumIncomingValues(), OrigNumIncoming);
+ // Check removeIncomingValue(BasicBlock *).
+ PHI->addIncoming(Arg, BB3);
+ PHI->removeIncomingValue(BB3);
+ EXPECT_EQ(PHI->getNumIncomingValues(), OrigNumIncoming);
+ // Check getBasicBlockIndex().
+ EXPECT_EQ(PHI->getBasicBlockIndex(BB1), LLVMPHI->getBasicBlockIndex(LLVMBB1));
+ // Check getIncomingValueForBlock().
+ EXPECT_EQ(PHI->getIncomingValueForBlock(BB1),
+ Ctx.getValue(LLVMPHI->getIncomingValueForBlock(LLVMBB1)));
+ // Check hasConstantValue().
+ llvm::Value *ConstV = LLVMPHI->hasConstantValue();
+ EXPECT_EQ(PHI->hasConstantValue(),
+ ConstV != nullptr ? Ctx.getValue(ConstV) : nullptr);
+ // Check hasConstantOrUndefValue().
+ EXPECT_EQ(PHI->hasConstantOrUndefValue(), LLVMPHI->hasConstantOrUndefValue());
+ // Check isComplete().
+ EXPECT_EQ(PHI->isComplete(), LLVMPHI->isComplete());
+
+ // Check create().
+ auto *NewPHI = cast<sandboxir::PHINode>(
+ sandboxir::PHINode::create(PHI->getType(), 0, Br, Ctx, "NewPHI"));
+ EXPECT_EQ(NewPHI->getType(), PHI->getType());
+ EXPECT_EQ(NewPHI->getNextNode(), Br);
+ EXPECT_EQ(NewPHI->getName(), "NewPHI");
+ EXPECT_EQ(NewPHI->getNumIncomingValues(), 0u);
+ for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+ sandboxir::BasicBlock *IncBB = PHI->getIncomingBlock(Idx);
+ NewPHI->addIncoming(V, IncBB);
+ }
+ EXPECT_EQ(NewPHI->getNumIncomingValues(), PHI->getNumIncomingValues());
+}
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp
index dd9dcd5..d016c77 100644
--- a/llvm/unittests/SandboxIR/TrackerTest.cpp
+++ b/llvm/unittests/SandboxIR/TrackerTest.cpp
@@ -69,6 +69,34 @@ define void @foo(ptr %ptr) {
EXPECT_EQ(Ld->getOperand(0), Gep0);
}
+TEST_F(TrackerTest, SetUse) {
+ parseIR(C, R"IR(
+define void @foo(ptr %ptr, i8 %arg) {
+ %ld = load i8, ptr %ptr
+ %add = add i8 %ld, %arg
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ auto *F = Ctx.createFunction(&LLVMF);
+ unsigned ArgIdx = 0;
+ auto *Arg0 = F->getArg(ArgIdx++);
+ auto *BB = &*F->begin();
+ auto &Tracker = Ctx.getTracker();
+ Tracker.save();
+ auto It = BB->begin();
+ auto *Ld = &*It++;
+ auto *Add = &*It++;
+
+ Ctx.save();
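+  // A Use modified after Ctx.save() should be restored by Ctx.revert().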
+ sandboxir::Use Use = Add->getOperandUse(0);
+ Use.set(Arg0);
+ EXPECT_EQ(Add->getOperand(0), Arg0);
+ Ctx.revert();
+ EXPECT_EQ(Add->getOperand(0), Ld);
+}
+
TEST_F(TrackerTest, SwapOperands) {
parseIR(C, R"IR(
define void @foo(i1 %cond) {
@@ -413,3 +441,270 @@ define i32 @foo(i32 %arg) {
EXPECT_EQ(&*It++, Ret);
EXPECT_EQ(It, BB->end());
}
+
+TEST_F(TrackerTest, CallBaseSetters) {
+ parseIR(C, R"IR(
+declare void @bar1(i8)
+declare void @bar2(i8)
+
+define void @foo(i8 %arg0, i8 %arg1) {
+ call void @bar1(i8 %arg0)
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+
+ auto *F = Ctx.createFunction(&LLVMF);
+ unsigned ArgIdx = 0;
+ auto *Arg0 = F->getArg(ArgIdx++);
+ auto *Arg1 = F->getArg(ArgIdx++);
+ auto *BB = &*F->begin();
+ auto It = BB->begin();
+ auto *Call = cast<sandboxir::CallBase>(&*It++);
+ [[maybe_unused]] auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+ // Check setArgOperand().
+ Ctx.save();
+ Call->setArgOperand(0, Arg1);
+ EXPECT_EQ(Call->getArgOperand(0), Arg1);
+ Ctx.revert();
+ EXPECT_EQ(Call->getArgOperand(0), Arg0);
+
+ auto *Bar1F = Call->getCalledFunction();
+ auto *Bar2F = Ctx.createFunction(M->getFunction("bar2"));
+
+ // Check setCalledOperand().
+ Ctx.save();
+ Call->setCalledOperand(Bar2F);
+ EXPECT_EQ(Call->getCalledOperand(), Bar2F);
+ Ctx.revert();
+ EXPECT_EQ(Call->getCalledOperand(), Bar1F);
+
+ // Check setCalledFunction().
+ Ctx.save();
+ Call->setCalledFunction(Bar2F);
+ EXPECT_EQ(Call->getCalledFunction(), Bar2F);
+ Ctx.revert();
+ EXPECT_EQ(Call->getCalledFunction(), Bar1F);
+}
+
+TEST_F(TrackerTest, InvokeSetters) {
+ parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+ invoke i8 @foo(i8 %arg) to label %normal_bb
+ unwind label %exception_bb
+ normal_bb:
+ ret void
+ exception_bb:
+ ret void
+ other_bb:
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF);
+ auto *BB0 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+ auto *NormalBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "normal_bb")));
+ auto *ExceptionBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "exception_bb")));
+ auto *OtherBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+ auto It = BB0->begin();
+ auto *Invoke = cast<sandboxir::InvokeInst>(&*It++);
+
+ // Check setNormalDest().
+ Ctx.save();
+ Invoke->setNormalDest(OtherBB);
+ EXPECT_EQ(Invoke->getNormalDest(), OtherBB);
+ Ctx.revert();
+ EXPECT_EQ(Invoke->getNormalDest(), NormalBB);
+
+ // Check setUnwindDest().
+ Ctx.save();
+ Invoke->setUnwindDest(OtherBB);
+ EXPECT_EQ(Invoke->getUnwindDest(), OtherBB);
+ Ctx.revert();
+ EXPECT_EQ(Invoke->getUnwindDest(), ExceptionBB);
+
+ // Check setSuccessor().
+ Ctx.save();
+ Invoke->setSuccessor(0, OtherBB);
+ EXPECT_EQ(Invoke->getSuccessor(0), OtherBB);
+ Ctx.revert();
+ EXPECT_EQ(Invoke->getSuccessor(0), NormalBB);
+
+ Ctx.save();
+ Invoke->setSuccessor(1, OtherBB);
+ EXPECT_EQ(Invoke->getSuccessor(1), OtherBB);
+ Ctx.revert();
+ EXPECT_EQ(Invoke->getSuccessor(1), ExceptionBB);
+}
+
+TEST_F(TrackerTest, CallBrSetters) {
+ parseIR(C, R"IR(
+define void @foo(i8 %arg) {
+ bb0:
+ callbr void @foo(i8 %arg)
+ to label %bb1 [label %bb2]
+ bb1:
+ ret void
+ bb2:
+ ret void
+ other_bb:
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF);
+ auto *BB0 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+ auto *OtherBB = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "other_bb")));
+ auto It = BB0->begin();
+ auto *CallBr = cast<sandboxir::CallBrInst>(&*It++);
+ // Check setDefaultDest().
+ Ctx.save();
+ auto *OrigDefaultDest = CallBr->getDefaultDest();
+ CallBr->setDefaultDest(OtherBB);
+ EXPECT_EQ(CallBr->getDefaultDest(), OtherBB);
+ Ctx.revert();
+ EXPECT_EQ(CallBr->getDefaultDest(), OrigDefaultDest);
+
+ // Check setIndirectDest().
+ Ctx.save();
+ auto *OrigIndirectDest = CallBr->getIndirectDest(0);
+ CallBr->setIndirectDest(0, OtherBB);
+ EXPECT_EQ(CallBr->getIndirectDest(0), OtherBB);
+ Ctx.revert();
+ EXPECT_EQ(CallBr->getIndirectDest(0), OrigIndirectDest);
+}
+
+TEST_F(TrackerTest, PHINodeSetters) {
+ parseIR(C, R"IR(
+define void @foo(i8 %arg0, i8 %arg1, i8 %arg2) {
+bb0:
+ br label %bb2
+
+bb1:
+ %phi = phi i8 [ %arg0, %bb0 ], [ %arg1, %bb1 ]
+ br label %bb1
+
+bb2:
+ ret void
+}
+)IR");
+ Function &LLVMF = *M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ auto &F = *Ctx.createFunction(&LLVMF);
+ unsigned ArgIdx = 0;
+ auto *Arg0 = F.getArg(ArgIdx++);
+ auto *Arg1 = F.getArg(ArgIdx++);
+ auto *Arg2 = F.getArg(ArgIdx++);
+ auto *BB0 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb0")));
+ auto *BB1 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb1")));
+ auto *BB2 = cast<sandboxir::BasicBlock>(
+ Ctx.getValue(getBasicBlockByName(LLVMF, "bb2")));
+ auto *PHI = cast<sandboxir::PHINode>(&*BB1->begin());
+
+ // Check setIncomingValue().
+ Ctx.save();
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ PHI->setIncomingValue(0, Arg2);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg2);
+ Ctx.revert();
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+ // Check setIncomingBlock().
+ Ctx.save();
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ PHI->setIncomingBlock(0, BB2);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB2);
+ Ctx.revert();
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+ // Check addIncoming().
+ Ctx.save();
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ PHI->addIncoming(Arg1, BB2);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 3u);
+ EXPECT_EQ(PHI->getIncomingBlock(2), BB2);
+ EXPECT_EQ(PHI->getIncomingValue(2), Arg1);
+ Ctx.revert();
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+ // Check removeIncomingValue(1).
+ Ctx.save();
+ PHI->removeIncomingValue(1);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ Ctx.revert();
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+ // Check removeIncomingValue(0).
+ Ctx.save();
+ PHI->removeIncomingValue(0u);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg1);
+ Ctx.revert();
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+  // Check removeIncomingValue() removing all incoming values.
+ Ctx.save();
+ PHI->removeIncomingValue(0u);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg1);
+ PHI->removeIncomingValue(0u);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 0u);
+ Ctx.revert();
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+
+ // Check removeIncomingValue(BasicBlock *).
+ Ctx.save();
+ PHI->removeIncomingValue(BB1);
+ EXPECT_EQ(PHI->getNumIncomingValues(), 1u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ Ctx.revert();
+ EXPECT_EQ(PHI->getNumIncomingValues(), 2u);
+ EXPECT_EQ(PHI->getIncomingBlock(0), BB0);
+ EXPECT_EQ(PHI->getIncomingValue(0), Arg0);
+ EXPECT_EQ(PHI->getIncomingBlock(1), BB1);
+ EXPECT_EQ(PHI->getIncomingValue(1), Arg1);
+}
diff --git a/llvm/unittests/Support/ErrorTest.cpp b/llvm/unittests/Support/ErrorTest.cpp
index bd098a4..4cd9896 100644
--- a/llvm/unittests/Support/ErrorTest.cpp
+++ b/llvm/unittests/Support/ErrorTest.cpp
@@ -930,6 +930,8 @@ TEST(Error, C_API) {
});
EXPECT_TRUE(GotCSE) << "Failed to round-trip ErrorList via C API";
EXPECT_TRUE(GotCE) << "Failed to round-trip ErrorList via C API";
+
+ LLVMCantFail(wrap(Error::success()));
}
TEST(Error, FileErrorTest) {
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index 1f4d45d..d8df742 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -48,15 +48,12 @@ class CodeGenDAGPatterns;
using TreePatternNodePtr = IntrusiveRefCntPtr<TreePatternNode>;
/// This represents a set of MVTs. Since the underlying type for the MVT
-/// is uint8_t, there are at most 256 values. To reduce the number of memory
+/// is uint16_t, there are at most 65536 values. To reduce the number of memory
/// allocations and deallocations, represent the set as a sequence of bits.
/// To reduce the allocations even further, make MachineValueTypeSet own
/// the storage and use std::array as the bit container.
struct MachineValueTypeSet {
- static_assert(std::is_same<std::underlying_type_t<MVT::SimpleValueType>,
- uint8_t>::value,
- "Change uint8_t here to the SimpleValueType's type");
- static unsigned constexpr Capacity = std::numeric_limits<uint8_t>::max() + 1;
+ static unsigned constexpr Capacity = 512;
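+  // Note: Capacity is assumed to be a multiple of WordWidth and large enough
+  // to hold every MVT::SimpleValueType value (see the asserts below).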
using WordType = uint64_t;
static unsigned constexpr WordWidth = CHAR_BIT * sizeof(WordType);
static unsigned constexpr NumWords = Capacity / WordWidth;
@@ -84,9 +81,11 @@ struct MachineValueTypeSet {
}
LLVM_ATTRIBUTE_ALWAYS_INLINE
unsigned count(MVT T) const {
+ assert(T.SimpleTy < Capacity && "Capacity needs to be enlarged");
return (Words[T.SimpleTy / WordWidth] >> (T.SimpleTy % WordWidth)) & 1;
}
std::pair<MachineValueTypeSet &, bool> insert(MVT T) {
+ assert(T.SimpleTy < Capacity && "Capacity needs to be enlarged");
bool V = count(T.SimpleTy);
Words[T.SimpleTy / WordWidth] |= WordType(1) << (T.SimpleTy % WordWidth);
return {*this, V};
@@ -98,6 +97,7 @@ struct MachineValueTypeSet {
}
LLVM_ATTRIBUTE_ALWAYS_INLINE
void erase(MVT T) {
+ assert(T.SimpleTy < Capacity && "Capacity needs to be enlarged");
Words[T.SimpleTy / WordWidth] &= ~(WordType(1) << (T.SimpleTy % WordWidth));
}
@@ -193,8 +193,6 @@ struct TypeSetByHwMode : public InfoByHwMode<MachineValueTypeSet> {
TypeSetByHwMode &operator=(const TypeSetByHwMode &) = default;
TypeSetByHwMode(MVT::SimpleValueType VT)
: TypeSetByHwMode(ValueTypeByHwMode(VT)) {}
- TypeSetByHwMode(ValueTypeByHwMode VT)
- : TypeSetByHwMode(ArrayRef<ValueTypeByHwMode>(&VT, 1)) {}
TypeSetByHwMode(ArrayRef<ValueTypeByHwMode> VTList);
SetType &getOrCreate(unsigned Mode) { return Map[Mode]; }
@@ -264,7 +262,8 @@ struct TypeInfer {
bool MergeInTypeInfo(TypeSetByHwMode &Out, MVT::SimpleValueType InVT) const {
return MergeInTypeInfo(Out, TypeSetByHwMode(InVT));
}
- bool MergeInTypeInfo(TypeSetByHwMode &Out, ValueTypeByHwMode InVT) const {
+ bool MergeInTypeInfo(TypeSetByHwMode &Out,
+ const ValueTypeByHwMode &InVT) const {
return MergeInTypeInfo(Out, TypeSetByHwMode(InVT));
}
@@ -841,7 +840,8 @@ public: // Higher level manipulation routines.
TreePattern &TP);
bool UpdateNodeType(unsigned ResNo, MVT::SimpleValueType InTy,
TreePattern &TP);
- bool UpdateNodeType(unsigned ResNo, ValueTypeByHwMode InTy, TreePattern &TP);
+ bool UpdateNodeType(unsigned ResNo, const ValueTypeByHwMode &InTy,
+ TreePattern &TP);
// Update node type with types inferred from an instruction operand or result
// def from the ins/outs lists.
@@ -996,7 +996,7 @@ inline bool TreePatternNode::UpdateNodeType(unsigned ResNo,
}
inline bool TreePatternNode::UpdateNodeType(unsigned ResNo,
- ValueTypeByHwMode InTy,
+ const ValueTypeByHwMode &InTy,
TreePattern &TP) {
TypeSetByHwMode VTS(InTy);
TP.getInfer().expandOverloads(VTS);
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index 229245f..1b93e3d 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -339,7 +339,8 @@ unsigned MatcherTableEmitter::SizeMatcher(Matcher *N, raw_ostream &OS) {
Size += 2; // Count the child's opcode.
} else {
Child = cast<SwitchTypeMatcher>(N)->getCaseMatcher(i);
- ++Size; // Count the child's type.
+        // Count the child's type.
+        Size += GetVBRSize(cast<SwitchTypeMatcher>(N)->getCaseType(i));
}
const unsigned ChildSize = SizeMatcherList(Child, OS);
assert(ChildSize != 0 && "Matcher cannot have child of size 0");
@@ -599,7 +600,8 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
IdxSize = 2; // size of opcode in table is 2 bytes.
} else {
Child = cast<SwitchTypeMatcher>(N)->getCaseMatcher(i);
- IdxSize = 1; // size of type in table is 1 byte.
+        // The size of the type in the table is the VBR size of the MVT.
+        IdxSize = GetVBRSize(cast<SwitchTypeMatcher>(N)->getCaseType(i));
}
if (i != 0) {
@@ -615,8 +617,13 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
CurrentIdx += EmitVBRValue(ChildSize, OS) + IdxSize;
if (const SwitchOpcodeMatcher *SOM = dyn_cast<SwitchOpcodeMatcher>(N))
OS << "TARGET_VAL(" << SOM->getCaseOpcode(i).getEnumName() << "),";
- else
- OS << getEnumName(cast<SwitchTypeMatcher>(N)->getCaseType(i)) << ',';
+ else {
+ if (!OmitComments)
+ OS << "/*" << getEnumName(cast<SwitchTypeMatcher>(N)->getCaseType(i))
+ << "*/";
+        EmitVBRValue(cast<SwitchTypeMatcher>(N)->getCaseType(i), OS);
+ }
if (!OmitComments)
OS << "// ->" << CurrentIdx + ChildSize;
OS << '\n';
@@ -639,7 +646,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
return CurrentIdx - StartIdx + 1;
}
- case Matcher::CheckType:
+ case Matcher::CheckType: {
if (cast<CheckTypeMatcher>(N)->getResNo() == 0) {
MVT::SimpleValueType VT = cast<CheckTypeMatcher>(N)->getType();
switch (VT) {
@@ -648,13 +655,21 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
OS << "OPC_CheckTypeI" << MVT(VT).getSizeInBits() << ",\n";
return 1;
default:
- OS << "OPC_CheckType, " << getEnumName(VT) << ",\n";
- return 2;
+ OS << "OPC_CheckType, ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(VT) << "*/";
+ unsigned NumBytes = EmitVBRValue(VT, OS);
+ OS << "\n";
+ return NumBytes + 1;
}
}
- OS << "OPC_CheckTypeRes, " << cast<CheckTypeMatcher>(N)->getResNo() << ", "
- << getEnumName(cast<CheckTypeMatcher>(N)->getType()) << ",\n";
- return 3;
+ OS << "OPC_CheckTypeRes, " << cast<CheckTypeMatcher>(N)->getResNo() << ", ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(cast<CheckTypeMatcher>(N)->getType()) << "*/";
+ unsigned NumBytes = EmitVBRValue(cast<CheckTypeMatcher>(N)->getType(), OS);
+ OS << "\n";
+ return NumBytes + 2;
+ }
case Matcher::CheckChildType: {
MVT::SimpleValueType VT = cast<CheckChildTypeMatcher>(N)->getType();
@@ -666,8 +681,12 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
return 1;
default:
OS << "OPC_CheckChild" << cast<CheckChildTypeMatcher>(N)->getChildNo()
- << "Type, " << getEnumName(VT) << ",\n";
- return 2;
+ << "Type, ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(VT) << "*/";
+ unsigned NumBytes = EmitVBRValue(VT, OS);
+ OS << "\n";
+ return NumBytes + 1;
}
}
@@ -696,10 +715,16 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
<< cast<CheckChild2CondCodeMatcher>(N)->getCondCodeName() << ",\n";
return 2;
- case Matcher::CheckValueType:
- OS << "OPC_CheckValueType, "
- << getEnumName(cast<CheckValueTypeMatcher>(N)->getVT()) << ",\n";
- return 2;
+ case Matcher::CheckValueType: {
+ OS << "OPC_CheckValueType, ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(cast<CheckValueTypeMatcher>(N)->getVT())
+ << "*/";
+ unsigned NumBytes =
+ EmitVBRValue(cast<CheckValueTypeMatcher>(N)->getVT(), OS);
+ OS << "\n";
+ return NumBytes + 1;
+ }
case Matcher::CheckComplexPat: {
const CheckComplexPatMatcher *CCPM = cast<CheckComplexPatMatcher>(N);
@@ -766,8 +791,10 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
OS << "OPC_EmitInteger" << MVT(VT).getSizeInBits() << ", ";
break;
default:
- OpBytes = 2;
- OS << "OPC_EmitInteger, " << getEnumName(VT) << ", ";
+ OS << "OPC_EmitInteger, ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(VT) << "*/";
+ OpBytes = EmitVBRValue(VT, OS) + 1;
break;
}
unsigned Bytes = OpBytes + EmitSignedVBRValue(Val, OS);
@@ -785,8 +812,10 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
OS << "OPC_EmitStringInteger" << MVT(VT).getSizeInBits() << ", ";
break;
default:
- OpBytes = 2;
- OS << "OPC_EmitStringInteger, " << getEnumName(VT) << ", ";
+ OS << "OPC_EmitStringInteger, ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(VT) << "*/";
+ OpBytes = EmitVBRValue(VT, OS) + 1;
break;
}
OS << Val << ",\n";
@@ -797,14 +826,17 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
const EmitRegisterMatcher *Matcher = cast<EmitRegisterMatcher>(N);
const CodeGenRegister *Reg = Matcher->getReg();
MVT::SimpleValueType VT = Matcher->getVT();
+ unsigned OpBytes;
// If the enum value of the register is larger than one byte can handle,
// use EmitRegister2.
if (Reg && Reg->EnumValue > 255) {
- OS << "OPC_EmitRegister2, " << getEnumName(VT) << ", ";
+ OS << "OPC_EmitRegister2, ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(VT) << "*/";
+ OpBytes = EmitVBRValue(VT, OS);
OS << "TARGET_VAL(" << getQualifiedName(Reg->TheDef) << "),\n";
- return 4;
+ return OpBytes + 3;
}
- unsigned OpBytes;
switch (VT) {
case MVT::i32:
case MVT::i64:
@@ -812,8 +844,10 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
OS << "OPC_EmitRegisterI" << MVT(VT).getSizeInBits() << ", ";
break;
default:
- OpBytes = 2;
- OS << "OPC_EmitRegister, " << getEnumName(VT) << ", ";
+ OS << "OPC_EmitRegister, ";
+ if (!OmitComments)
+ OS << "/*" << getEnumName(VT) << "*/";
+ OpBytes = EmitVBRValue(VT, OS) + 1;
break;
}
if (Reg) {
@@ -958,8 +992,12 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
OS << "/*#VTs*/";
OS << ", ";
}
- for (unsigned i = 0, e = EN->getNumVTs(); i != e; ++i)
- OS << getEnumName(EN->getVT(i)) << ", ";
+ unsigned NumTypeBytes = 0;
+ for (unsigned i = 0, e = EN->getNumVTs(); i != e; ++i) {
+ if (!OmitComments)
+ OS << "/*" << getEnumName(EN->getVT(i)) << "*/";
+ NumTypeBytes += EmitVBRValue(EN->getVT(i), OS);
+ }
OS << EN->getNumOperands();
if (!OmitComments)
@@ -992,7 +1030,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
} else
OS << '\n';
- return 4 + !CompressVTs + !CompressNodeInfo + EN->getNumVTs() +
+ return 4 + !CompressVTs + !CompressNodeInfo + NumTypeBytes +
NumOperandBytes + NumCoveredBytes;
}
case Matcher::CompleteMatch: {
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index 0439df8..8e54ead5 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -16,48 +16,37 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSet.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/Support/DXILABI.h"
+#include "llvm/Support/VersionTuple.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
+
#include <string>
+#include <vector>
using namespace llvm;
using namespace llvm::dxil;
namespace {
-struct DXILShaderModel {
- int Major = 0;
- int Minor = 0;
-};
-
struct DXILOperationDesc {
std::string OpName; // name of DXIL operation
int OpCode; // ID of DXIL operation
StringRef OpClass; // name of the opcode class
StringRef Doc; // the documentation description of this instruction
- SmallVector<Record *> OpTypes; // Vector of operand type records -
- // return type is at index 0
- SmallVector<std::string>
- OpAttributes; // operation attribute represented as strings
- StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which
- // means no map exists
- bool IsDeriv = false; // whether this is some kind of derivative
- bool IsGradient = false; // whether this requires a gradient calculation
- bool IsFeedback = false; // whether this is a sampler feedback op
- bool IsWave =
- false; // whether this requires in-wave, cross-lane functionality
- bool RequiresUniformInputs = false; // whether this operation requires that
- // all of its inputs are uniform across
- // the wave
+ // Vector of operand type records - return type is at index 0
+ SmallVector<Record *> OpTypes;
+ SmallVector<Record *> OverloadRecs;
+ SmallVector<Record *> StageRecs;
+ SmallVector<Record *> AttrRecs;
+ StringRef Intrinsic; // The llvm intrinsic map to OpName. Default is "" which
+ // means no map exists
SmallVector<StringRef, 4>
ShaderStages; // shader stages to which this applies, empty for all.
- DXILShaderModel ShaderModel; // minimum shader model required
- DXILShaderModel ShaderModelTranslated; // minimum shader model required with
- // translation by linker
int OverloadParamIndex; // Index of parameter with overload type.
// -1 : no overload types
SmallVector<StringRef, 4> counters; // counters for this inst.
@@ -91,18 +80,32 @@ static ParameterKind getParameterKind(const Record *R) {
return ParameterKind::I32;
case MVT::fAny:
case MVT::iAny:
+ case MVT::Any:
return ParameterKind::Overload;
- case MVT::Other:
- // Handle DXIL-specific overload types
- if (R->getValueAsInt("isHalfOrFloat") || R->getValueAsInt("isI16OrI32")) {
- return ParameterKind::Overload;
- }
- [[fallthrough]];
default:
- llvm_unreachable("Support for specified DXIL Type not yet implemented");
+ llvm_unreachable(
+ "Support for specified parameter type not yet implemented");
}
}
+/// Sort TableGen records that carry a Version field named dxil_version
+/// in place, in ascending version order.
+static void AscendingSortByVersion(std::vector<Record *> &Recs) {
+ std::sort(Recs.begin(), Recs.end(), [](Record *RecA, Record *RecB) {
+ unsigned RecAMaj =
+ RecA->getValueAsDef("dxil_version")->getValueAsInt("Major");
+ unsigned RecAMin =
+ RecA->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+ unsigned RecBMaj =
+ RecB->getValueAsDef("dxil_version")->getValueAsInt("Major");
+ unsigned RecBMin =
+ RecB->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+
+ return (VersionTuple(RecAMaj, RecAMin) < VersionTuple(RecBMaj, RecBMin));
+ });
+}
+
/// Construct an object using the DXIL Operation records specified
/// in DXIL.td. This serves as the single source of reference of
/// the information extracted from the specified Record R, for
@@ -113,9 +116,15 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
OpCode = R->getValueAsInt("OpCode");
Doc = R->getValueAsString("Doc");
+ SmallVector<Record *> ParamTypeRecs;
+
+ ParamTypeRecs.push_back(R->getValueAsDef("result"));
- auto TypeRecs = R->getValueAsListOfDefs("OpTypes");
- unsigned TypeRecsSize = TypeRecs.size();
+ std::vector<Record *> ArgTys = R->getValueAsListOfDefs("arguments");
+ for (auto Ty : ArgTys) {
+ ParamTypeRecs.push_back(Ty);
+ }
+ size_t ParamTypeRecsSize = ParamTypeRecs.size();
// Populate OpTypes with return type and parameter types
// Parameter indices of overloaded parameters.
@@ -124,30 +133,23 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
// the comment before the definition of class LLVMMatchType in
// llvm/IR/Intrinsics.td
SmallVector<int> OverloadParamIndices;
- for (unsigned i = 0; i < TypeRecsSize; i++) {
- auto TR = TypeRecs[i];
+ for (unsigned i = 0; i < ParamTypeRecsSize; i++) {
+ auto TR = ParamTypeRecs[i];
// Track operation parameter indices of any overload types
auto isAny = TR->getValueAsInt("isAny");
if (isAny == 1) {
- // TODO: At present it is expected that all overload types in a DXIL Op
- // are of the same type. Hence, OverloadParamIndices will have only one
- // element. This implies we do not need a vector. However, until more
- // (all?) DXIL Ops are added in DXIL.td, a vector is being used to flag
- // cases this assumption would not hold.
+ // All overload types in a DXIL Op are required to be of the same type.
if (!OverloadParamIndices.empty()) {
- bool knownType = true;
+ [[maybe_unused]] bool knownType = true;
// Ensure that the same overload type registered earlier is being used
for (auto Idx : OverloadParamIndices) {
- if (TR != TypeRecs[Idx]) {
+ if (TR != ParamTypeRecs[Idx]) {
knownType = false;
break;
}
}
- if (!knownType) {
- report_fatal_error("Specification of multiple differing overload "
- "parameter types not yet supported",
- false);
- }
+ assert(knownType && "Specification of multiple differing overload "
+ "parameter types not yet supported");
} else {
OverloadParamIndices.push_back(i);
}
@@ -160,7 +162,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
// Get the parameter index of anonymous type, TR, references
auto OLParamIndex = TR->getValueAsInt("Number");
// Resolve and insert the type to that at OLParamIndex
- OpTypes.emplace_back(TypeRecs[OLParamIndex]);
+ OpTypes.emplace_back(ParamTypeRecs[OLParamIndex]);
} else {
// A non-anonymous type. Just record it in OpTypes
OpTypes.emplace_back(TR);
@@ -170,28 +172,62 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) {
// Set the index of the overload parameter, if any.
OverloadParamIndex = -1; // default; indicating none
if (!OverloadParamIndices.empty()) {
- if (OverloadParamIndices.size() > 1)
- report_fatal_error("Multiple overload type specification not supported",
- false);
+ assert(OverloadParamIndices.size() == 1 &&
+ "Multiple overload type specification not supported");
OverloadParamIndex = OverloadParamIndices[0];
}
+
+ // Get overload records
+ std::vector<Record *> Recs = R->getValueAsListOfDefs("overloads");
+
+ // Sort records in ascending order of DXIL version
+ AscendingSortByVersion(Recs);
+
+ for (Record *CR : Recs) {
+ OverloadRecs.push_back(CR);
+ }
+
+ // Get stage records
+ Recs = R->getValueAsListOfDefs("stages");
+
+ if (Recs.empty()) {
+ PrintFatalError(R, Twine("Atleast one specification of valid stage for ") +
+ OpName + " is required");
+ }
+
+ // Sort records in ascending order of DXIL version
+ AscendingSortByVersion(Recs);
+
+ for (Record *CR : Recs) {
+ StageRecs.push_back(CR);
+ }
+
+ // Get attribute records
+ Recs = R->getValueAsListOfDefs("attributes");
+
+ // Sort records in ascending order of DXIL version
+ AscendingSortByVersion(Recs);
+
+ for (Record *CR : Recs) {
+ AttrRecs.push_back(CR);
+ }
+
// Get the operation class
OpClass = R->getValueAsDef("OpClass")->getName();
- if (R->getValue("LLVMIntrinsic")) {
- auto *IntrinsicDef = R->getValueAsDef("LLVMIntrinsic");
- auto DefName = IntrinsicDef->getName();
- assert(DefName.starts_with("int_") && "invalid intrinsic name");
- // Remove the int_ from intrinsic name.
- Intrinsic = DefName.substr(4);
- // TODO: For now, assume that attributes of DXIL Operation are the same as
- // that of the intrinsic. Deviations are expected to be encoded in TableGen
- // record specification and handled accordingly here. Support to be added
- // as needed.
- auto IntrPropList = IntrinsicDef->getValueAsListInit("IntrProperties");
- auto IntrPropListSize = IntrPropList->size();
- for (unsigned i = 0; i < IntrPropListSize; i++) {
- OpAttributes.emplace_back(IntrPropList->getElement(i)->getAsString());
+  if (OpClass == "UnknownOpClass") {
+ PrintFatalError(R, Twine("Unspecified DXIL OpClass for DXIL operation - ") +
+ OpName);
+ }
+
+ const RecordVal *RV = R->getValue("LLVMIntrinsic");
+ if (RV && RV->getValue()) {
+ if (DefInit *DI = dyn_cast<DefInit>(RV->getValue())) {
+ auto *IntrinsicDef = DI->getDef();
+ auto DefName = IntrinsicDef->getName();
+ assert(DefName.starts_with("int_") && "invalid intrinsic name");
+ // Remove the int_ from intrinsic name.
+ Intrinsic = DefName.substr(4);
}
}
}
@@ -239,10 +275,8 @@ static std::string getParameterKindStr(ParameterKind Kind) {
/// \return std::string string representation of OverloadKind
static std::string getOverloadKindStr(const Record *R) {
- auto VTRec = R->getValueAsDef("VT");
+ Record *VTRec = R->getValueAsDef("VT");
switch (getValueType(VTRec)) {
- case MVT::isVoid:
- return "OverloadKind::VOID";
case MVT::f16:
return "OverloadKind::HALF";
case MVT::f32:
@@ -259,57 +293,163 @@ static std::string getOverloadKindStr(const Record *R) {
return "OverloadKind::I32";
case MVT::i64:
return "OverloadKind::I64";
- case MVT::iAny:
- return "OverloadKind::I16 | OverloadKind::I32 | OverloadKind::I64";
- case MVT::fAny:
- return "OverloadKind::HALF | OverloadKind::FLOAT | OverloadKind::DOUBLE";
- case MVT::Other:
- // Handle DXIL-specific overload types
- {
- if (R->getValueAsInt("isHalfOrFloat")) {
- return "OverloadKind::HALF | OverloadKind::FLOAT";
- } else if (R->getValueAsInt("isI16OrI32")) {
- return "OverloadKind::I16 | OverloadKind::I32";
- }
- }
- [[fallthrough]];
default:
- llvm_unreachable(
- "Support for specified parameter OverloadKind not yet implemented");
+ llvm_unreachable("Support for specified fixed type option for overload "
+ "type not supported");
}
}
-/// Emit Enums of DXIL Ops
-/// \param A vector of DXIL Ops
-/// \param Output stream
-static void emitDXILEnums(std::vector<DXILOperationDesc> &Ops,
- raw_ostream &OS) {
- // Sort by OpCode
- llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) {
- return A.OpCode < B.OpCode;
- });
+/// Return a string representation of valid overload information denoted
+/// by the input records.
+///
+/// \param Recs A vector of TableGen Overload records
+/// \return std::string string representation of the overload mask
+/// predicated by DXIL Version. E.g.,
+/// {{{1, 0}, Mask1}, {{1, 2}, Mask2}, ...}
+static std::string getOverloadMaskString(const SmallVector<Record *> Recs) {
+ std::string MaskString = "";
+ std::string Prefix = "";
+ MaskString.append("{");
+  // If no overload information records were specified, assume the operation
+  // a) is supported in DXIL Version 1.0 and later, and
+  // b) has no overload types
+ if (Recs.empty()) {
+ MaskString.append("{{1, 0}, OverloadKind::UNDEFINED}}");
+ } else {
+ for (auto Rec : Recs) {
+ unsigned Major =
+ Rec->getValueAsDef("dxil_version")->getValueAsInt("Major");
+ unsigned Minor =
+ Rec->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+ MaskString.append(Prefix)
+ .append("{{")
+ .append(std::to_string(Major))
+ .append(", ")
+ .append(std::to_string(Minor).append("}, "));
+
+ std::string PipePrefix = "";
+ auto Tys = Rec->getValueAsListOfDefs("overload_types");
+ if (Tys.empty()) {
+ MaskString.append("OverloadKind::UNDEFINED");
+ }
+ for (const auto *Ty : Tys) {
+ MaskString.append(PipePrefix).append(getOverloadKindStr(Ty));
+ PipePrefix = " | ";
+ }
- OS << "// Enumeration for operations specified by DXIL\n";
- OS << "enum class OpCode : unsigned {\n";
+ MaskString.append("}");
+ Prefix = ", ";
+ }
+ MaskString.append("}");
+ }
+ return MaskString;
+}
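To make the generated initializer shape concrete, here is a hand-written example of what the function above could produce for a hypothetical operation whose overload records list half/float from DXIL 1.0 and additionally double from 1.2 (versions and types are illustrative, not taken from DXIL.td):

```c++
// Illustrative output only; real masks depend on the records in DXIL.td.
static const char *ExampleOverloadMask =
    "{{{1, 0}, OverloadKind::HALF | OverloadKind::FLOAT}, "
    "{{1, 2}, OverloadKind::HALF | OverloadKind::FLOAT | OverloadKind::DOUBLE}}";
```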
- for (auto &Op : Ops) {
- // Name = ID, // Doc
- OS << Op.OpName << " = " << Op.OpCode << ", // " << Op.Doc << "\n";
+/// Return a string representation of valid shader stage information denoted
+/// by the input records.
+///
+/// \param Recs A vector of TableGen Stages records
+/// \return std::string string representation of the stages mask
+/// predicated by DXIL Version. E.g.,
+/// {{{1, 0}, Mask1}, {{1, 2}, Mask2}, ...}
+static std::string getStageMaskString(const SmallVector<Record *> Recs) {
+ std::string MaskString = "";
+ std::string Prefix = "";
+ MaskString.append("{");
+  // At least one stage information record is expected to be specified.
+  if (Recs.empty()) {
+    PrintFatalError("At least one specification of valid stages for "
+                    "the operation is required");
}
- OS << "\n};\n\n";
+ for (auto Rec : Recs) {
+ unsigned Major = Rec->getValueAsDef("dxil_version")->getValueAsInt("Major");
+ unsigned Minor = Rec->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+ MaskString.append(Prefix)
+ .append("{{")
+ .append(std::to_string(Major))
+ .append(", ")
+ .append(std::to_string(Minor).append("}, "));
+
+ std::string PipePrefix = "";
+ auto Stages = Rec->getValueAsListOfDefs("shader_stages");
+ if (Stages.empty()) {
+ PrintFatalError("No valid stages for operation specified");
+ }
+ for (const auto *S : Stages) {
+ MaskString.append(PipePrefix).append("ShaderKind::").append(S->getName());
+ PipePrefix = " | ";
+ }
- OS << "// Groups for DXIL operations with equivalent function templates\n";
- OS << "enum class OpCodeClass : unsigned {\n";
- // Build an OpClass set to print
- SmallSet<StringRef, 2> OpClassSet;
- for (auto &Op : Ops) {
- OpClassSet.insert(Op.OpClass);
+ MaskString.append("}");
+ Prefix = ", ";
}
- for (auto &C : OpClassSet) {
- OS << C << ",\n";
+ MaskString.append("}");
+ return MaskString;
+}
+
+/// Return a string representation of valid attribute information denoted
+/// by the input records.
+///
+/// \param Recs A vector of TableGen Attribute records
+/// \return std::string string representation of the attribute mask
+/// predicated by DXIL Version. E.g.,
+/// {{{1, 0}, Mask1}, {{1, 2}, Mask2}, ...}
+static std::string getAttributeMaskString(const SmallVector<Record *> Recs) {
+ std::string MaskString = "";
+ std::string Prefix = "";
+ MaskString.append("{");
+
+ for (auto Rec : Recs) {
+ unsigned Major = Rec->getValueAsDef("dxil_version")->getValueAsInt("Major");
+ unsigned Minor = Rec->getValueAsDef("dxil_version")->getValueAsInt("Minor");
+ MaskString.append(Prefix)
+ .append("{{")
+ .append(std::to_string(Major))
+ .append(", ")
+ .append(std::to_string(Minor).append("}, "));
+
+ std::string PipePrefix = "";
+ auto Attrs = Rec->getValueAsListOfDefs("op_attrs");
+ if (Attrs.empty()) {
+ MaskString.append("Attribute::None");
+ } else {
+ for (const auto *Attr : Attrs) {
+ MaskString.append(PipePrefix)
+ .append("Attribute::")
+ .append(Attr->getName());
+ PipePrefix = " | ";
+ }
+ }
+
+ MaskString.append("}");
+ Prefix = ", ";
}
- OS << "\n};\n\n";
+ MaskString.append("}");
+ return MaskString;
+}
+
+/// Emit a mapping of DXIL opcode to opname
+static void emitDXILOpCodes(std::vector<DXILOperationDesc> &Ops,
+ raw_ostream &OS) {
+ OS << "#ifdef DXIL_OPCODE\n";
+ for (const DXILOperationDesc &Op : Ops)
+ OS << "DXIL_OPCODE(" << Op.OpCode << ", " << Op.OpName << ")\n";
+ OS << "#undef DXIL_OPCODE\n";
+ OS << "\n";
+ OS << "#endif\n\n";
+}
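A consumer defines `DXIL_OPCODE` and includes the generated file; since the generated block ends with `#undef DXIL_OPCODE`, the macro only lives for the include. A self-contained sketch of that pattern, with two made-up rows standing in for the generated `.inc` contents (the opcode numbers and names are placeholders):

```c++
#include <cstdio>

struct OpNameEntry {
  int OpCode;
  const char *Name;
};

// Expand each DXIL_OPCODE(opcode, name) row into a table entry. In-tree the
// rows would come from the generated include; these two are invented.
#define DXIL_OPCODE(Op, Name) {Op, #Name},
static const OpNameEntry OpNames[] = {
    DXIL_OPCODE(6, FAbs)
    DXIL_OPCODE(13, Sin)
};
#undef DXIL_OPCODE

int main() {
  for (const OpNameEntry &E : OpNames)
    std::printf("DXIL opcode %d is %s\n", E.OpCode, E.Name);
  return 0;
}
```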
+
+/// Emit a list of DXIL op classes
+static void emitDXILOpClasses(RecordKeeper &Records,
+ raw_ostream &OS) {
+ OS << "#ifdef DXIL_OPCLASS\n";
+ std::vector<Record *> OpClasses =
+ Records.getAllDerivedDefinitions("DXILOpClass");
+ for (Record *OpClass : OpClasses)
+ OS << "DXIL_OPCLASS(" << OpClass->getName() << ")\n";
+ OS << "#undef DXIL_OPCLASS\n";
+ OS << "#endif\n\n";
}
/// Emit map of DXIL operation to LLVM or DirectX intrinsic
@@ -317,37 +457,17 @@ static void emitDXILEnums(std::vector<DXILOperationDesc> &Ops,
/// \param Output stream
static void emitDXILIntrinsicMap(std::vector<DXILOperationDesc> &Ops,
raw_ostream &OS) {
+ OS << "#ifdef DXIL_OP_INTRINSIC\n";
OS << "\n";
- // FIXME: use array instead of SmallDenseMap.
- OS << "static const SmallDenseMap<Intrinsic::ID, dxil::OpCode> LowerMap = "
- "{\n";
- for (auto &Op : Ops) {
+ for (const auto &Op : Ops) {
if (Op.Intrinsic.empty())
continue;
- // {Intrinsic::sin, dxil::OpCode::Sin},
- OS << " { Intrinsic::" << Op.Intrinsic << ", dxil::OpCode::" << Op.OpName
- << "},\n";
+ OS << "DXIL_OP_INTRINSIC(dxil::OpCode::" << Op.OpName
+ << ", Intrinsic::" << Op.Intrinsic << ")\n";
}
- OS << "};\n";
OS << "\n";
-}
-
-/// Convert operation attribute string to Attribute enum
-///
-/// \param Attr string reference
-/// \return std::string Attribute enum string
-
-static std::string emitDXILOperationAttr(SmallVector<std::string> Attrs) {
- for (auto Attr : Attrs) {
- // TODO: For now just recognize IntrNoMem and IntrReadMem as valid and
- // ignore others.
- if (Attr == "IntrNoMem") {
- return "Attribute::ReadNone";
- } else if (Attr == "IntrReadMem") {
- return "Attribute::ReadOnly";
- }
- }
- return "Attribute::None";
+ OS << "#undef DXIL_OP_INTRINSIC\n";
+ OS << "#endif\n\n";
}
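The `DXIL_OP_INTRINSIC` rows lend themselves to building an intrinsic-to-opcode switch. A hedged, self-contained sketch follows; the enum types and the single row below are placeholders, not the real generated contents:

```c++
enum class IntrinsicID { sin, unknown };
enum class OpCode { Sin };

// Map an intrinsic to its DXIL opcode by expanding DXIL_OP_INTRINSIC rows
// into case labels; returns false when no mapping exists.
static bool getOpCodeForIntrinsic(IntrinsicID IID, OpCode &Op) {
  switch (IID) {
#define DXIL_OP_INTRINSIC(OpC, Intr)                                           \
  case Intr:                                                                   \
    Op = OpC;                                                                  \
    return true;
    // One invented row standing in for the generated include.
    DXIL_OP_INTRINSIC(OpCode::Sin, IntrinsicID::sin)
#undef DXIL_OP_INTRINSIC
  default:
    return false;
  }
}
```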
/// Emit DXIL operation table
@@ -355,11 +475,6 @@ static std::string emitDXILOperationAttr(SmallVector<std::string> Attrs) {
/// \param Output stream
static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
raw_ostream &OS) {
- // Sort by OpCode.
- llvm::sort(Ops, [](DXILOperationDesc &A, DXILOperationDesc &B) {
- return A.OpCode < B.OpCode;
- });
-
// Collect Names.
SequenceToOffsetTable<std::string> OpClassStrings;
SequenceToOffsetTable<std::string> OpStrings;
@@ -388,15 +503,13 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
OpClassStrings.layout();
Parameters.layout();
- // Emit the DXIL operation table.
- //{dxil::OpCode::Sin, OpCodeNameIndex, OpCodeClass::unary,
- // OpCodeClassNameIndex,
- // OverloadKind::FLOAT | OverloadKind::HALF, Attribute::AttrKind::ReadNone, 0,
- // 3, ParameterTableOffset},
+  // Emit access function getOpCodeProperty() that embeds the DXIL operation
+  // table with entries of type struct OpCodeProperty.
OS << "static const OpCodeProperty *getOpCodeProperty(dxil::OpCode Op) "
"{\n";
OS << " static const OpCodeProperty OpCodeProps[] = {\n";
+ std::string Prefix = "";
for (auto &Op : Ops) {
// Consider Op.OverloadParamIndex as the overload parameter index, by
// default
@@ -408,13 +521,15 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
if (OLParamIdx < 0) {
OLParamIdx = (Op.OpTypes.size() > 1) ? 1 : 0;
}
- OS << " { dxil::OpCode::" << Op.OpName << ", " << OpStrings.get(Op.OpName)
- << ", OpCodeClass::" << Op.OpClass << ", "
+ OS << Prefix << " { dxil::OpCode::" << Op.OpName << ", "
+ << OpStrings.get(Op.OpName) << ", OpCodeClass::" << Op.OpClass << ", "
<< OpClassStrings.get(Op.OpClass.data()) << ", "
- << getOverloadKindStr(Op.OpTypes[OLParamIdx]) << ", "
- << emitDXILOperationAttr(Op.OpAttributes) << ", "
- << Op.OverloadParamIndex << ", " << Op.OpTypes.size() - 1 << ", "
- << Parameters.get(ParameterMap[Op.OpClass]) << " },\n";
+ << getOverloadMaskString(Op.OverloadRecs) << ", "
+ << getStageMaskString(Op.StageRecs) << ", "
+ << getAttributeMaskString(Op.AttrRecs) << ", " << Op.OverloadParamIndex
+ << ", " << Op.OpTypes.size() - 1 << ", "
+ << Parameters.get(ParameterMap[Op.OpClass]) << " }";
+ Prefix = ",\n";
}
OS << " };\n";
@@ -466,7 +581,43 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
OS << " };\n\n";
OS << " unsigned Index = Prop.ParameterTableOffset;\n";
OS << " return DXILOpParameterKindTable + Index;\n";
- OS << "}\n ";
+ OS << "}\n\n";
+}
+
+static void emitDXILOperationTableDataStructs(RecordKeeper &Records,
+ raw_ostream &OS) {
+ // Get Shader stage records
+ std::vector<Record *> ShaderKindRecs =
+ Records.getAllDerivedDefinitions("DXILShaderStage");
+ // Sort records by name
+ llvm::sort(ShaderKindRecs,
+ [](Record *A, Record *B) { return A->getName() < B->getName(); });
+
+ OS << "// Valid shader kinds\n\n";
+ // Choose the type of enum ShaderKind based on the number of stages declared.
+  // This gives the flexibility to just add new stage records in DXIL.td, if
+  // needed, with no need to change this backend code.
+ size_t ShaderKindCount = ShaderKindRecs.size();
+ uint64_t ShaderKindTySz = PowerOf2Ceil(ShaderKindRecs.size() + 1);
+ OS << "enum ShaderKind : uint" << ShaderKindTySz << "_t {\n";
+ const std::string allStages("all_stages");
+ const std::string removed("removed");
+ int shiftVal = 1;
+ for (auto R : ShaderKindRecs) {
+ auto Name = R->getName();
+ if (Name.compare(removed) == 0) {
+ OS << " " << Name
+ << " = 0, // Pseudo-stage indicating op not supported in any "
+ "stage\n";
+ } else if (Name.compare(allStages) == 0) {
+ OS << " " << Name << " = 0x"
+ << utohexstr(((1 << ShaderKindCount) - 1), false, 0)
+ << ", // Pseudo-stage indicating op is supported in all stages\n";
+ } else if (Name.compare(allStages)) {
+ OS << " " << Name << " = 1 << " << std::to_string(shiftVal++) << ",\n";
+ }
+ }
+ OS << "}; // enum ShaderKind\n\n";
}
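To visualize the output, here is roughly what the code above would print for five hypothetical stage records named all_stages, compute, pixel, removed, and vertex (sorted by name; hand-reconstructed rather than generated):

```c++
#include <cstdint>

// Values follow the emitter's loop: the pseudo-stages get 0 and the all-ones
// mask, and each remaining record gets the next bit starting at bit 1.
enum ShaderKind : uint8_t {
  all_stages = 0x1F, // Pseudo-stage indicating op is supported in all stages
  compute = 1 << 1,
  pixel = 1 << 2,
  removed = 0, // Pseudo-stage indicating op not supported in any stage
  vertex = 1 << 3,
}; // enum ShaderKind
```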
/// Entry function call that invokes the functionality of this TableGen backend
@@ -475,21 +626,31 @@ static void emitDXILOperationTable(std::vector<DXILOperationDesc> &Ops,
static void EmitDXILOperation(RecordKeeper &Records, raw_ostream &OS) {
OS << "// Generated code, do not edit.\n";
OS << "\n";
- // Get all DXIL Ops to intrinsic mapping records
- std::vector<Record *> OpIntrMaps =
- Records.getAllDerivedDefinitions("DXILOpMapping");
+ // Get all DXIL Ops property records
+ std::vector<Record *> OpIntrProps =
+ Records.getAllDerivedDefinitions("DXILOp");
std::vector<DXILOperationDesc> DXILOps;
- for (auto *Record : OpIntrMaps) {
+ for (auto *Record : OpIntrProps) {
DXILOps.emplace_back(DXILOperationDesc(Record));
}
- OS << "#ifdef DXIL_OP_ENUM\n";
- emitDXILEnums(DXILOps, OS);
- OS << "#endif\n\n";
- OS << "#ifdef DXIL_OP_INTRINSIC_MAP\n";
+ // Sort by opcode.
+ llvm::sort(DXILOps, [](DXILOperationDesc &A, DXILOperationDesc &B) {
+ return A.OpCode < B.OpCode;
+ });
+ int PrevOp = -1;
+ for (DXILOperationDesc &Desc : DXILOps) {
+ if (Desc.OpCode == PrevOp)
+ PrintFatalError(Twine("Duplicate opcode: ") + Twine(Desc.OpCode));
+ PrevOp = Desc.OpCode;
+ }
+
+ emitDXILOpCodes(DXILOps, OS);
+ emitDXILOpClasses(Records, OS);
emitDXILIntrinsicMap(DXILOps, OS);
- OS << "#endif\n\n";
- OS << "#ifdef DXIL_OP_OPERATION_TABLE\n";
+ OS << "#ifdef DXIL_OP_OPERATION_TABLE\n\n";
+ emitDXILOperationTableDataStructs(Records, OS);
emitDXILOperationTable(DXILOps, OS);
+ OS << "#undef DXIL_OP_OPERATION_TABLE\n";
OS << "#endif\n\n";
}
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index c29cb4e..2ebe8f7 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -1162,7 +1162,7 @@ Error GlobalISelEmitter::importChildMatcher(
OperandMatcher &OM =
InsnOperand.getInsnMatcher().addOperand(0, "", TempOpIdx);
if (auto Error =
- OM.addTypeCheckPredicate(VTy, false /* OperandIsAPointer */))
+ OM.addTypeCheckPredicate(TypeSetByHwMode(VTy), false /* OperandIsAPointer */))
return failedImport(toString(std::move(Error)) +
" for result of Src pattern operator");
diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
index 01cfd4a..7c3abef 100644
--- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -227,22 +227,20 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
// Emit file header.
emitSourceFileHeader("Pseudo-instruction MC lowering Source Fragment", o);
- o << "bool " << Target.getName() + "AsmPrinter"
- << "::\n"
- << "emitPseudoExpansionLowering(MCStreamer &OutStreamer,\n"
- << " const MachineInstr *MI) {\n";
+ o << "bool " << Target.getName() + "AsmPrinter::\n"
+ << "lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst) {\n";
if (!Expansions.empty()) {
- o << " switch (MI->getOpcode()) {\n"
+ o << " Inst.clear();\n"
+ << " switch (MI->getOpcode()) {\n"
<< " default: return false;\n";
for (auto &Expansion : Expansions) {
CodeGenInstruction &Source = Expansion.Source;
CodeGenInstruction &Dest = Expansion.Dest;
o << " case " << Source.Namespace << "::" << Source.TheDef->getName()
<< ": {\n"
- << " MCInst TmpInst;\n"
<< " MCOperand MCOp;\n"
- << " TmpInst.setOpcode(" << Dest.Namespace
+ << " Inst.setOpcode(" << Dest.Namespace
<< "::" << Dest.TheDef->getName() << ");\n";
// Copy the operands from the source instruction.
@@ -260,15 +258,15 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
.MIOperandNo +
i
<< "), MCOp);\n"
- << " TmpInst.addOperand(MCOp);\n";
+ << " Inst.addOperand(MCOp);\n";
break;
case OpData::Imm:
- o << " TmpInst.addOperand(MCOperand::createImm("
+ o << " Inst.addOperand(MCOperand::createImm("
<< Expansion.OperandMap[MIOpNo + i].Data.Imm << "));\n";
break;
case OpData::Reg: {
Record *Reg = Expansion.OperandMap[MIOpNo + i].Data.Reg;
- o << " TmpInst.addOperand(MCOperand::createReg(";
+ o << " Inst.addOperand(MCOperand::createReg(";
// "zero_reg" is special.
if (Reg->getName() == "zero_reg")
o << "0";
@@ -287,10 +285,9 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
o << " for (unsigned i = " << MIOpNo
<< ", e = MI->getNumOperands(); i != e; ++i)\n"
<< " if (lowerOperand(MI->getOperand(i), MCOp))\n"
- << " TmpInst.addOperand(MCOp);\n";
+ << " Inst.addOperand(MCOp);\n";
}
- o << " EmitToStreamer(OutStreamer, TmpInst);\n"
- << " break;\n"
+ o << " break;\n"
<< " }\n";
}
o << " }\n return true;";
diff --git a/llvm/utils/TableGen/VTEmitter.cpp b/llvm/utils/TableGen/VTEmitter.cpp
index 851a525..eb58148 100644
--- a/llvm/utils/TableGen/VTEmitter.cpp
+++ b/llvm/utils/TableGen/VTEmitter.cpp
@@ -79,12 +79,12 @@ static void VTtoGetLLVMTyString(raw_ostream &OS, const Record *VT) {
void VTEmitter::run(raw_ostream &OS) {
emitSourceFileHeader("ValueTypes Source Fragment", OS, Records);
- std::array<const Record *, 256> VTsByNumber = {};
+ std::vector<const Record *> VTsByNumber{512};
auto ValueTypes = Records.getAllDerivedDefinitions("ValueType");
for (auto *VT : ValueTypes) {
auto Number = VT->getValueAsInt("Value");
assert(0 <= Number && Number < (int)VTsByNumber.size() &&
- "ValueType should be uint8_t");
+ "ValueType should be uint16_t");
assert(!VTsByNumber[Number] && "Duplicate ValueType");
VTsByNumber[Number] = VT;
}
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
index 7825396..c6c6ef9 100644
--- a/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/BUILD.gn
@@ -15,6 +15,7 @@ static_library("Interpreter") {
deps = [
":InterpreterProperties",
":InterpreterPropertiesEnum",
+ "Interfaces",
"//lldb/source/Commands",
"//lldb/source/Core",
"//lldb/source/DataFormatters",
diff --git a/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
new file mode 100644
index 0000000..2e70c54
--- /dev/null
+++ b/llvm/utils/gn/secondary/lldb/source/Interpreter/Interfaces/BUILD.gn
@@ -0,0 +1,9 @@
+static_library("Interfaces") {
+ output_name = "lldbInterpreterInterfaces"
+ configs += [ "//llvm/utils/gn/build:lldb_code" ]
+ deps = [
+ "//lldb/source/Utility",
+ "//llvm/lib/Support",
+ ]
+ sources = [ "ScriptedInterfaceUsages.cpp" ]
+}
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
index fd921f6..76f28a7 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-ctxprof-util/BUILD.gn
@@ -1,6 +1,4 @@
-import("//llvm/utils/gn/build/driver_executable.gni")
-
-driver_executable("llvm-ctxprof-util") {
+executable("llvm-ctxprof-util") {
deps = [
"//llvm/lib/IR",
"//llvm/lib/Object",
diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md
index 01fadef..c252d9c 100644
--- a/mlir/docs/DefiningDialects/Operations.md
+++ b/mlir/docs/DefiningDialects/Operations.md
@@ -101,6 +101,9 @@ their semantics via a special [TableGen backend][TableGenBackend]:
* The `AttrConstraint` class hierarchy: They are used to specify the
constraints over attributes. A notable subclass hierarchy is `Attr`, which
stands for constraints for attributes whose values are of common types.
+* The `Property` class hierarchy: They are used to specify non-attribute-backed
+ properties that are inherent to operations. This will be expanded to a
+ `PropertyConstraint` class or something similar in the future.
An operation is defined by specializing the `Op` class with concrete contents
for all the fields it requires. For example, `tf.AvgPool` is defined as
@@ -172,9 +175,9 @@ understanding the operation.
### Operation arguments
-There are two kinds of arguments: operands and attributes. Operands are runtime
-values produced by other ops; while attributes are compile-time known constant
-values, including two categories:
+There are three kinds of arguments: operands, attributes, and properties.
+Operands are runtime values produced by other ops; while attributes and properties
+are compile-time known constant values, including two categories:
1. Natural attributes: these attributes affect the behavior of the operations
(e.g., padding for convolution);
@@ -187,8 +190,11 @@ values, including two categories:
even though they are not materialized, it should be possible to store as an
attribute.
-Both operands and attributes are specified inside the `dag`-typed `arguments`,
-led by `ins`:
+Properties are similar to attributes, except that they are not stored within
+the MLIR context but are stored inline with the operation.
+
+Operands, attributes, and properties are specified inside the `dag`-typed
+`arguments`, led by `ins`:
```tablegen
let arguments = (ins
@@ -196,13 +202,15 @@ let arguments = (ins
...
<attr-constraint>:$<attr-name>,
...
+ <property-constraint>:$<property-name>,
);
```
Here `<type-constraint>` is a TableGen `def` from the `TypeConstraint` class
hierarchy. Similarly, `<attr-constraint>` is a TableGen `def` from the
-`AttrConstraint` class hierarchy. See [Constraints](#constraints) for more
-information.
+`AttrConstraint` class hierarchy and `<property-constraint>` is a subclass
+of `Property` (though a `PropertyConstraint` hierarchy is planned).
+See [Constraints](#constraints) for more information.
There are no requirements on the relative order of operands and attributes; they
can mix freely. The relative order of operands themselves matters. From each
@@ -324,6 +332,18 @@ Right now, the following primitive constraints are supported:
TODO: Design and implement more primitive constraints
+#### Optional and default-valued properties
+
+To declare a property with a default value, use `DefaultValuedProperty<..., "...">`.
+If the property's storage data type is different from its interface type,
+for example, in the case of array properties (which are stored as `SmallVector`s
+but use `ArrayRef` as an interface type), add the storage-type equivalent
+of the default value as the third argument.
+
+To declare an optional property, use `OptionalProperty<...>`.
+This wraps the underlying property in an `std::optional` and gives it a
+default value of `std::nullopt`.
+
#### Combining constraints
`AllAttrOf` is provided to allow combination of multiple constraints which
@@ -429,6 +449,8 @@ def MyOp : ... {
I32Attr:$i32_attr,
F32Attr:$f32_attr,
...
+ I32Property:$i32_prop,
+ ...
);
let results = (outs
@@ -453,7 +475,8 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
static void build(OpBuilder &odsBuilder, OperationState &odsState,
Type i32_result, Type f32_result, ...,
Value i32_operand, Value f32_operand, ...,
- IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+ IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+ int32_t i32_prop);
// Each result-type/operand/attribute has a separate parameter. The parameters
// for attributes are raw values unwrapped with mlir::Attribute instances.
@@ -462,13 +485,15 @@ static void build(OpBuilder &odsBuilder, OperationState &odsState,
static void build(OpBuilder &odsBuilder, OperationState &odsState,
Type i32_result, Type f32_result, ...,
Value i32_operand, Value f32_operand, ...,
- APInt i32_attr, StringRef f32_attr, ...);
+ APInt i32_attr, StringRef f32_attr, ...,
+ int32_t i32_prop, ...);
// Each operand/attribute has a separate parameter but result type is aggregate.
static void build(OpBuilder &odsBuilder, OperationState &odsState,
TypeRange resultTypes,
Value i32_operand, Value f32_operand, ...,
- IntegerAttr i32_attr, FloatAttr f32_attr, ...);
+ IntegerAttr i32_attr, FloatAttr f32_attr, ...,
+ int32_t i32_prop, ...);
// All operands/attributes have aggregate parameters.
// Generated if return type can be inferred.
@@ -921,8 +946,10 @@ optional-group: `(` then-elements `)` (`:` `(` else-elements `)`)? `?`
The elements of an optional group have the following requirements:
 * The first element of `then-elements` must be either an attribute, literal,
- operand, or region.
+ operand, property, or region.
- This is because the first element must be optionally parsable.
+ - If a property is used, it must have an `optionalParser` defined and have a
+ default value.
* Exactly one argument variable or type directive within either
`then-elements` or `else-elements` must be marked as the anchor of the
group.
@@ -984,6 +1011,8 @@ foo.op is_read_only
foo.op
```
+The same logic applies to a `UnitProperty`.
+
##### Optional "else" Group
Optional groups also have support for an "else" group of elements. These are
@@ -1026,6 +1055,8 @@ to:
1. All operand and result types must appear within the format using the various
`type` directives, either individually or with the `operands` or `results`
directives.
+1. Unless all non-attribute properties appear in the format, the `prop-dict`
+ directive must be present.
1. The `attr-dict` directive must always be present.
1. Must not contain overlapping information; e.g. multiple instances of
'attr-dict', types, operands, etc.
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 3f27e15..aa2b454 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -433,6 +433,46 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
let assemblyFormat = "attr-dict";
}
+def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
+ "The possible options for scheduling barriers",
+ [
+ I32BitEnumAttrCaseNone<"none">,
+ I32BitEnumAttrCaseBit<"non_mem_non_sideffect", 0>,
+ I32BitEnumAttrCaseBit<"valu", 1>,
+ I32BitEnumAttrCaseBit<"salu", 2>,
+ I32BitEnumAttrCaseBit<"mfma_wmma", 3>,
+ I32BitEnumAttrCaseBit<"all_vmem", 4>,
+ I32BitEnumAttrCaseBit<"vmem_read", 5>,
+ I32BitEnumAttrCaseBit<"vmem_write", 6>,
+ I32BitEnumAttrCaseBit<"all_ds", 7>,
+ I32BitEnumAttrCaseBit<"ds_read", 8>,
+ I32BitEnumAttrCaseBit<"ds_write", 9>,
+ I32BitEnumAttrCaseBit<"transcendental", 10>
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierOpOpt,
+ "sched_barrier_opt">{
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_SchedBarrierOp :
+ AMDGPU_Op<"sched_barrier">,
+ Arguments<(ins AMDGPU_SchedBarrierOpOptAttr:$opts)>
+ {
+ let summary = "Barrier that limits the backend scheduler of instruction movement";
+ let description = [{
+ `amdgpu.sched_barrier` serves as a barrier that could be
+ configured to restrict movements of instructions through it as
+ defined by sched_barrier_opts.
+ }];
+ let assemblyFormat = [{
+ `allow` `=` $opts attr-dict
+ }];
+}
+
def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
"The possible permutations of the lanes storing B available in an MFMA",
[
diff --git a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
index 181bc5a..3bdbfb0 100644
--- a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
+++ b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
@@ -50,7 +50,7 @@ def AssertOp : CF_Op<"assert",
Example:
```mlir
- assert %b, "Expected ... to be true"
+ cf.assert %b, "Expected ... to be true"
```
}];
@@ -118,7 +118,7 @@ def CondBranchOp : CF_Op<"cond_br",
Pure, Terminator]> {
let summary = "conditional branch operation";
let description = [{
- The `cond_br` terminator operation represents a conditional branch on a
+ The `cf.cond_br` terminator operation represents a conditional branch on a
boolean (1-bit integer) value. If the bit is set, then the first destination
is jumped to; if it is false, the second destination is chosen. The count
and types of operands must align with the arguments in the corresponding
@@ -136,7 +136,7 @@ def CondBranchOp : CF_Op<"cond_br",
```mlir
func.func @select(%a: i32, %b: i32, %flag: i1) -> i32 {
// Both targets are the same, operands differ
- cond_br %flag, ^bb1(%a : i32), ^bb1(%b : i32)
+ cf.cond_br %flag, ^bb1(%a : i32), ^bb1(%b : i32)
^bb1(%x : i32) :
return %x : i32
@@ -233,7 +233,7 @@ def SwitchOp : CF_Op<"switch",
Pure, Terminator]> {
let summary = "switch operation";
let description = [{
- The `switch` terminator operation represents a switch on a signless integer
+ The `cf.switch` terminator operation represents a switch on a signless integer
value. If the flag matches one of the specified cases, then the
corresponding destination is jumped to. If the flag does not match any of
the cases, the default destination is jumped to. The count and types of
@@ -242,7 +242,7 @@ def SwitchOp : CF_Op<"switch",
Example:
```mlir
- switch %flag : i32, [
+ cf.switch %flag : i32, [
default: ^bb1(%a : i32),
42: ^bb1(%b : i32),
43: ^bb3(%c : i32)
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index 626b057..b8f9615 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -989,7 +989,7 @@ def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", [CExpression]> {
Example:
```mlir
- %0 = emitc.unary_plus %arg0 : (i32) -> i32
+ %0 = emitc.unary_minus %arg0 : (i32) -> i32
```
```c++
// Code emitted for the operation above.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index b14bd83..260d421 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -59,22 +59,9 @@ class LLVM_IntArithmeticOpWithOverflowFlag<string mnemonic, string instName,
list<Trait> traits = []> :
LLVM_ArithmeticOpBase<AnySignlessInteger, mnemonic, instName,
!listconcat([DeclareOpInterfaceMethods<IntegerOverflowFlagsInterface>], traits)> {
- dag iofArg = (ins EnumProperty<"IntegerOverflowFlags">:$overflowFlags);
+ dag iofArg = (ins EnumProperty<"IntegerOverflowFlags", "", "IntegerOverflowFlags::none">:$overflowFlags);
let arguments = !con(commonArgs, iofArg);
- let builders = [
- OpBuilder<(ins "Type":$type, "Value":$lhs, "Value":$rhs,
- "IntegerOverflowFlags":$overflowFlags), [{
- $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
- build($_builder, $_state, type, lhs, rhs);
- }]>,
- OpBuilder<(ins "Value":$lhs, "Value":$rhs,
- "IntegerOverflowFlags":$overflowFlags), [{
- $_state.getOrAddProperties<Properties>().overflowFlags = overflowFlags;
- build($_builder, $_state, lhs, rhs);
- }]>
- ];
-
string mlirBuilder = [{
auto op = $_builder.create<$_qualCppClassName>($_location, $lhs, $rhs);
moduleImport.setIntegerOverflowFlags(inst, op);
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
index 3af6427..db25c9b 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
@@ -30,6 +30,10 @@ class GenericOp;
class LinalgOp;
} // namespace linalg
+namespace scf {
+struct SCFTilingResult;
+} // namespace scf
+
namespace tensor {
class InsertSliceOp;
class PackOp;
@@ -60,7 +64,7 @@ tileToForallOpImpl(RewriterBase &rewriter, transform::TransformState &state,
ArrayRef<OpFoldResult> mixedNumThreads,
ArrayRef<OpFoldResult> mixedTileSizes,
std::optional<ArrayAttr> mapping,
- linalg::ForallTilingResult &tilingResult);
+ scf::SCFTilingResult &tilingResult);
} // namespace transform
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 0c7a8ed..477ef7b 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -866,29 +866,6 @@ FailureOr<ContinuousTileSizeSpecification>
computeContinuousTileSizes(OpBuilder &builder, TilingInterface op,
unsigned dimension, OpFoldResult targetSize,
bool emitAssertions);
-/// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying
-/// tiling by `numThreads`.
-/// If non-empty, the `mapping` is added as an attribute to the
-/// resulting `scf.forall`.
-/// Zero tile sizes indicate that the dimension is not tiled, and can be
-/// thought of as tiling by the full size of data. It is the user's
-/// responsibility to ensure that `numThreads` is a valid tiling specification
-/// (i.e. that only tiles parallel dimensions, e.g. in the Linalg case).
-struct ForallTilingResult {
- Operation *tileOp;
- Operation *tiledOp;
-};
-FailureOr<ForallTilingResult> tileToForallOp(RewriterBase &builder,
- TilingInterface op,
- ArrayRef<OpFoldResult> numThreads,
- std::optional<ArrayAttr> mapping);
-
-/// Same as `tileToForallOp`, but calculate the number of threads
-/// required using the given tileSizes.
-FailureOr<ForallTilingResult>
-tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op,
- ArrayRef<OpFoldResult> tileSizes,
- std::optional<ArrayAttr> mapping);
/// Transformation information returned after reduction tiling.
struct ForallReductionTilingResult {
@@ -1750,10 +1727,12 @@ void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m,
void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns);
/// Adds patterns that reduce the rank of named contraction ops that have
-/// unit dimensions in the operand(s) by converting to a sequence of `collapse_shape`,
-/// `<corresponding linalg named op>`, `expand_shape` (if on tensors). For example a
-/// `linalg.batch_matmul` with unit batch size will convert to `linalg.matmul`
-/// and a `linalg.matvec` with with unit spatial dim in lhs will convert to a `linalg.dot`.
+/// unit dimensions in the operand(s) by converting to a sequence of
+/// `collapse_shape`, `<corresponding linalg named op>`, `expand_shape`
+/// (if on tensors). For example, a `linalg.batch_matmul` with unit batch size
+/// will convert to `linalg.matmul`, and a `linalg.matvec` with unit spatial
+/// dim in lhs will convert to a `linalg.dot`.
void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns);
} // namespace linalg
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index dda8f31..1f52f6b 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -20,6 +20,7 @@
#ifndef NVGPU
#define NVGPU
+include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
@@ -109,10 +110,22 @@ def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
let cppNamespace = "::mlir::nvgpu";
}
+def RcpApprox : I32EnumAttrCase<"APPROX", 0, "approx">;
+def RcpRN : I32EnumAttrCase<"RN", 1, "rn">;
+def RcpRZ : I32EnumAttrCase<"RZ", 2, "rz">;
+def RcpRM : I32EnumAttrCase<"RM", 3, "rm">;
+def RcpRP : I32EnumAttrCase<"RP", 4, "rp">;
+def RcpRoundingMode : I32EnumAttr<"RcpRoundingMode", "Rounding mode of rcp",
+ [RcpApprox, RcpRN, RcpRZ, RcpRM, RcpRP]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::nvgpu";
+}
+
def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
+def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
//===----------------------------------------------------------------------===//
// NVGPU Type Definitions
@@ -802,4 +815,24 @@ def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulat
let hasVerifier = 1;
}
+def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
+ SameOperandsAndResultType]> {
+ let summary = "The reciprocal calculation for vector types";
+ let description = [{
+    Reciprocal calculation for `vector` types using `nvvm.rcp` ops.
+
+ Currently, only the `approx` rounding mode and `ftz` are supported, and only for the `f32` type.
+
+ The input and output must be of the same vector type and shape.
+ }];
+ let arguments = (ins VectorOf<[F32]>:$in,
+ DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
+ UnitAttr:$ftz);
+ let results = (outs VectorOf<[F32]>:$out);
+ let assemblyFormat = [{
+ $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
+ attr-dict `:` type($out)
+ }];
+ let hasVerifier = 1;
+}
#endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index 19070f6..aad2ac6 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -17,6 +17,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
diff --git a/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt
index 2aa4b18..66b1e89 100644
--- a/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenACC/CMakeLists.txt
@@ -21,9 +21,10 @@ mlir_tablegen(OpenACCOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=a
add_public_tablegen_target(MLIROpenACCAttributesIncGen)
add_dependencies(mlir-headers MLIROpenACCAttributesIncGen)
+add_mlir_interface(OpenACCOpsInterfaces)
+
set(LLVM_TARGET_DEFINITIONS OpenACCTypeInterfaces.td)
mlir_tablegen(OpenACCTypeInterfaces.h.inc -gen-type-interface-decls)
mlir_tablegen(OpenACCTypeInterfaces.cpp.inc -gen-type-interface-defs)
add_public_tablegen_target(MLIROpenACCTypeInterfacesIncGen)
add_dependencies(mlir-headers MLIROpenACCTypeInterfacesIncGen)
-
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index 8239367..ca96ce6 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -22,6 +22,7 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/OpenACC/OpenACCOpsDialect.h.inc"
#include "mlir/Dialect/OpenACC/OpenACCOpsEnums.h.inc"
+#include "mlir/Dialect/OpenACC/OpenACCOpsInterfaces.h.inc"
#include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.h.inc"
#include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 148bed6..d9f3825 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -22,6 +22,7 @@ include "mlir/IR/OpBase.td"
include "mlir/IR/SymbolInterfaces.td"
include "mlir/Dialect/OpenACC/OpenACCBase.td"
include "mlir/Dialect/OpenACC/OpenACCOpsTypes.td"
+include "mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td"
include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td"
include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td"
include "mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.td"
@@ -1067,7 +1068,9 @@ def OpenACC_ReductionRecipeOp : OpenACC_Op<"reduction.recipe",
//===----------------------------------------------------------------------===//
def OpenACC_ParallelOp : OpenACC_Op<"parallel",
- [AttrSizedOperandSegments, RecursiveMemoryEffects,
+ [AttrSizedOperandSegments, AutomaticAllocationScope,
+ RecursiveMemoryEffects,
+ DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
let summary = "parallel construct";
@@ -1231,7 +1234,9 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
//===----------------------------------------------------------------------===//
def OpenACC_SerialOp : OpenACC_Op<"serial",
- [AttrSizedOperandSegments, RecursiveMemoryEffects,
+ [AttrSizedOperandSegments, AutomaticAllocationScope,
+ RecursiveMemoryEffects,
+ DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
let summary = "serial construct";
@@ -1347,7 +1352,9 @@ def OpenACC_SerialOp : OpenACC_Op<"serial",
//===----------------------------------------------------------------------===//
def OpenACC_KernelsOp : OpenACC_Op<"kernels",
- [AttrSizedOperandSegments, RecursiveMemoryEffects,
+ [AttrSizedOperandSegments, AutomaticAllocationScope,
+ RecursiveMemoryEffects,
+ DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
let summary = "kernels construct";
@@ -1737,9 +1744,11 @@ def OpenACC_HostDataOp : OpenACC_Op<"host_data",
//===----------------------------------------------------------------------===//
def OpenACC_LoopOp : OpenACC_Op<"loop",
- [AttrSizedOperandSegments, RecursiveMemoryEffects,
- MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>,
- DeclareOpInterfaceMethods<LoopLikeOpInterface>]> {
+ [AttrSizedOperandSegments, AutomaticAllocationScope,
+ RecursiveMemoryEffects,
+ DeclareOpInterfaceMethods<ComputeRegionOpInterface>,
+ DeclareOpInterfaceMethods<LoopLikeOpInterface>,
+ MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
let summary = "loop construct";
let description = [{
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
new file mode 100644
index 0000000..6fb9a95
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
@@ -0,0 +1,29 @@
+//===-- OpenACCOpsInterfaces.td - OpenACC op interfaces -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENACC_OPS_INTERFACES
+#define OPENACC_OPS_INTERFACES
+
+include "mlir/IR/OpBase.td"
+
+def ComputeRegionOpInterface : OpInterface<"ComputeRegionOpInterface"> {
+ let cppNamespace = "::mlir::acc";
+
+ let description = [{
+ An interface for compute and loop construct operations.
+ }];
+
+ let methods = [
+ InterfaceMethod<"Get alloca block", "::mlir::Block*", "getAllocaBlock",
+ (ins), [{
+ return &$_op.getRegion().front();
+ }]>,
+ ];
+}
+
+#endif // OPENACC_OPS_INTERFACES
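A short sketch of how a pass might query the new interface, assuming the usual generated interface class in `::mlir::acc` (the helper name is illustrative):

```c++
#include "mlir/Dialect/OpenACC/OpenACC.h"

// Return the alloca block of the enclosing OpenACC compute/loop construct,
// or nullptr if the operation does not implement the interface.
static mlir::Block *getAccAllocaBlock(mlir::Operation *op) {
  if (auto computeRegionOp =
          llvm::dyn_cast<mlir::acc::ComputeRegionOpInterface>(op))
    return computeRegionOp.getAllocaBlock();
  return nullptr;
}
```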
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
index 0eefe06..38e4d8f 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h
@@ -32,37 +32,37 @@ namespace omp {
struct AlignedClauseOps {
llvm::SmallVector<Value> alignedVars;
- llvm::SmallVector<Attribute> alignmentAttrs;
+ llvm::SmallVector<Attribute> alignments;
};
struct AllocateClauseOps {
- llvm::SmallVector<Value> allocatorVars, allocateVars;
+ llvm::SmallVector<Value> allocateVars, allocatorVars;
};
struct CancelDirectiveNameClauseOps {
- ClauseCancellationConstructTypeAttr cancelDirectiveNameAttr;
-};
-
-struct CollapseClauseOps {
- llvm::SmallVector<Value> loopLBVar, loopUBVar, loopStepVar;
+ ClauseCancellationConstructTypeAttr cancelDirective;
};
struct CopyprivateClauseOps {
llvm::SmallVector<Value> copyprivateVars;
- llvm::SmallVector<Attribute> copyprivateFuncs;
+ llvm::SmallVector<Attribute> copyprivateSyms;
};
struct CriticalNameClauseOps {
- StringAttr criticalNameAttr;
+ /// This field has a generic name because it's mirroring the `sym_name`
+ /// argument of the `OpenMP_CriticalNameClause` tablegen definition. That one
+ /// can't be renamed to anything more specific because the `sym_name` name is
+ /// a requirement of the `Symbol` MLIR trait associated with that clause.
+ StringAttr symName;
};
struct DependClauseOps {
- llvm::SmallVector<Attribute> dependTypeAttrs;
+ llvm::SmallVector<Attribute> dependKinds;
llvm::SmallVector<Value> dependVars;
};
struct DeviceClauseOps {
- Value deviceVar;
+ Value device;
};
struct DeviceTypeClauseOps {
@@ -71,26 +71,26 @@ struct DeviceTypeClauseOps {
};
struct DistScheduleClauseOps {
- UnitAttr distScheduleStaticAttr;
- Value distScheduleChunkSizeVar;
+ UnitAttr distScheduleStatic;
+ Value distScheduleChunkSize;
};
struct DoacrossClauseOps {
- llvm::SmallVector<Value> doacrossVectorVars;
- ClauseDependAttr doacrossDependTypeAttr;
- IntegerAttr doacrossNumLoopsAttr;
+ ClauseDependAttr doacrossDependType;
+ IntegerAttr doacrossNumLoops;
+ llvm::SmallVector<Value> doacrossDependVars;
};
struct FilterClauseOps {
- Value filteredThreadIdVar;
+ Value filteredThreadId;
};
struct FinalClauseOps {
- Value finalVar;
+ Value final;
};
struct GrainsizeClauseOps {
- Value grainsizeVar;
+ Value grainsize;
};
struct HasDeviceAddrClauseOps {
@@ -98,7 +98,7 @@ struct HasDeviceAddrClauseOps {
};
struct HintClauseOps {
- IntegerAttr hintAttr;
+ IntegerAttr hint;
};
struct IfClauseOps {
@@ -107,8 +107,8 @@ struct IfClauseOps {
struct InReductionClauseOps {
llvm::SmallVector<Value> inReductionVars;
- llvm::SmallVector<bool> inReductionVarsByRef;
- llvm::SmallVector<Attribute> inReductionDeclSymbols;
+ llvm::SmallVector<bool> inReductionByref;
+ llvm::SmallVector<Attribute> inReductionSyms;
};
struct IsDevicePtrClauseOps {
@@ -120,7 +120,8 @@ struct LinearClauseOps {
};
struct LoopRelatedOps {
- UnitAttr loopInclusiveAttr;
+ llvm::SmallVector<Value> loopLowerBounds, loopUpperBounds, loopSteps;
+ UnitAttr loopInclusive;
};
struct MapClauseOps {
@@ -128,11 +129,11 @@ struct MapClauseOps {
};
struct MergeableClauseOps {
- UnitAttr mergeableAttr;
+ UnitAttr mergeable;
};
struct NogroupClauseOps {
- UnitAttr nogroupAttr;
+ UnitAttr nogroup;
};
struct NontemporalClauseOps {
@@ -140,36 +141,36 @@ struct NontemporalClauseOps {
};
struct NowaitClauseOps {
- UnitAttr nowaitAttr;
+ UnitAttr nowait;
};
struct NumTasksClauseOps {
- Value numTasksVar;
+ Value numTasks;
};
struct NumTeamsClauseOps {
- Value numTeamsLowerVar, numTeamsUpperVar;
+ Value numTeamsLower, numTeamsUpper;
};
struct NumThreadsClauseOps {
- Value numThreadsVar;
+ Value numThreads;
};
struct OrderClauseOps {
- ClauseOrderKindAttr orderAttr;
- OrderModifierAttr orderModAttr;
+ ClauseOrderKindAttr order;
+ OrderModifierAttr orderMod;
};
struct OrderedClauseOps {
- IntegerAttr orderedAttr;
+ IntegerAttr ordered;
};
struct ParallelizationLevelClauseOps {
- UnitAttr parLevelSimdAttr;
+ UnitAttr parLevelSimd;
};
struct PriorityClauseOps {
- Value priorityVar;
+ Value priority;
};
struct PrivateClauseOps {
@@ -179,46 +180,46 @@ struct PrivateClauseOps {
llvm::SmallVector<Value> privateVars;
// The list of symbols referring to delayed privatizer ops (i.e. `omp.private`
// ops).
- llvm::SmallVector<Attribute> privatizers;
+ llvm::SmallVector<Attribute> privateSyms;
};
struct ProcBindClauseOps {
- ClauseProcBindKindAttr procBindKindAttr;
+ ClauseProcBindKindAttr procBindKind;
};
struct ReductionClauseOps {
llvm::SmallVector<Value> reductionVars;
- llvm::SmallVector<bool> reductionVarsByRef;
- llvm::SmallVector<Attribute> reductionDeclSymbols;
+ llvm::SmallVector<bool> reductionByref;
+ llvm::SmallVector<Attribute> reductionSyms;
};
struct SafelenClauseOps {
- IntegerAttr safelenAttr;
+ IntegerAttr safelen;
};
struct ScheduleClauseOps {
- ClauseScheduleKindAttr scheduleValAttr;
- ScheduleModifierAttr scheduleModAttr;
- Value scheduleChunkVar;
- UnitAttr scheduleSimdAttr;
+ ClauseScheduleKindAttr scheduleKind;
+ Value scheduleChunk;
+ ScheduleModifierAttr scheduleMod;
+ UnitAttr scheduleSimd;
};
struct SimdlenClauseOps {
- IntegerAttr simdlenAttr;
+ IntegerAttr simdlen;
};
struct TaskReductionClauseOps {
llvm::SmallVector<Value> taskReductionVars;
- llvm::SmallVector<bool> taskReductionVarsByRef;
- llvm::SmallVector<Attribute> taskReductionDeclSymbols;
+ llvm::SmallVector<bool> taskReductionByref;
+ llvm::SmallVector<Attribute> taskReductionSyms;
};
struct ThreadLimitClauseOps {
- Value threadLimitVar;
+ Value threadLimit;
};
struct UntiedClauseOps {
- UnitAttr untiedAttr;
+ UnitAttr untied;
};
struct UseDeviceAddrClauseOps {
@@ -241,82 +242,81 @@ template <typename... Mixins>
struct Clauses : public Mixins... {};
} // namespace detail
-using CancelClauseOps =
+using CancelOperands =
detail::Clauses<CancelDirectiveNameClauseOps, IfClauseOps>;
-using CancellationPointClauseOps =
- detail::Clauses<CancelDirectiveNameClauseOps>;
+using CancellationPointOperands = detail::Clauses<CancelDirectiveNameClauseOps>;
-using CriticalClauseOps = detail::Clauses<CriticalNameClauseOps, HintClauseOps>;
+using CriticalDeclareOperands =
+ detail::Clauses<CriticalNameClauseOps, HintClauseOps>;
// TODO `indirect` clause.
-using DeclareTargetClauseOps = detail::Clauses<DeviceTypeClauseOps>;
+using DeclareTargetOperands = detail::Clauses<DeviceTypeClauseOps>;
-using DistributeClauseOps =
+using DistributeOperands =
detail::Clauses<AllocateClauseOps, DistScheduleClauseOps, OrderClauseOps,
PrivateClauseOps>;
-using LoopNestClauseOps = detail::Clauses<CollapseClauseOps, LoopRelatedOps>;
+using LoopNestOperands = detail::Clauses<LoopRelatedOps>;
-using MaskedClauseOps = detail::Clauses<FilterClauseOps>;
+using MaskedOperands = detail::Clauses<FilterClauseOps>;
-using OrderedOpClauseOps = detail::Clauses<DoacrossClauseOps>;
+using OrderedOperands = detail::Clauses<DoacrossClauseOps>;
-using OrderedRegionClauseOps = detail::Clauses<ParallelizationLevelClauseOps>;
+using OrderedRegionOperands = detail::Clauses<ParallelizationLevelClauseOps>;
-using ParallelClauseOps =
+using ParallelOperands =
detail::Clauses<AllocateClauseOps, IfClauseOps, NumThreadsClauseOps,
PrivateClauseOps, ProcBindClauseOps, ReductionClauseOps>;
-using SectionsClauseOps = detail::Clauses<AllocateClauseOps, NowaitClauseOps,
- PrivateClauseOps, ReductionClauseOps>;
+using SectionsOperands = detail::Clauses<AllocateClauseOps, NowaitClauseOps,
+ PrivateClauseOps, ReductionClauseOps>;
-// TODO `linear` clause.
-using SimdClauseOps =
- detail::Clauses<AlignedClauseOps, IfClauseOps, NontemporalClauseOps,
- OrderClauseOps, PrivateClauseOps, ReductionClauseOps,
- SafelenClauseOps, SimdlenClauseOps>;
+using SimdOperands =
+ detail::Clauses<AlignedClauseOps, IfClauseOps, LinearClauseOps,
+ NontemporalClauseOps, OrderClauseOps, PrivateClauseOps,
+ ReductionClauseOps, SafelenClauseOps, SimdlenClauseOps>;
-using SingleClauseOps = detail::Clauses<AllocateClauseOps, CopyprivateClauseOps,
- NowaitClauseOps, PrivateClauseOps>;
+using SingleOperands = detail::Clauses<AllocateClauseOps, CopyprivateClauseOps,
+ NowaitClauseOps, PrivateClauseOps>;
// TODO `defaultmap`, `uses_allocators` clauses.
-using TargetClauseOps =
+using TargetOperands =
detail::Clauses<AllocateClauseOps, DependClauseOps, DeviceClauseOps,
HasDeviceAddrClauseOps, IfClauseOps, InReductionClauseOps,
IsDevicePtrClauseOps, MapClauseOps, NowaitClauseOps,
PrivateClauseOps, ThreadLimitClauseOps>;
-using TargetDataClauseOps =
+using TargetDataOperands =
detail::Clauses<DeviceClauseOps, IfClauseOps, MapClauseOps,
UseDeviceAddrClauseOps, UseDevicePtrClauseOps>;
-using TargetEnterExitUpdateDataClauseOps =
+using TargetEnterExitUpdateDataOperands =
detail::Clauses<DependClauseOps, DeviceClauseOps, IfClauseOps, MapClauseOps,
NowaitClauseOps>;
// TODO `affinity`, `detach` clauses.
-using TaskClauseOps =
+using TaskOperands =
detail::Clauses<AllocateClauseOps, DependClauseOps, FinalClauseOps,
IfClauseOps, InReductionClauseOps, MergeableClauseOps,
PriorityClauseOps, PrivateClauseOps, UntiedClauseOps>;
-using TaskgroupClauseOps =
+using TaskgroupOperands =
detail::Clauses<AllocateClauseOps, TaskReductionClauseOps>;
-using TaskloopClauseOps =
+using TaskloopOperands =
detail::Clauses<AllocateClauseOps, FinalClauseOps, GrainsizeClauseOps,
IfClauseOps, InReductionClauseOps, MergeableClauseOps,
NogroupClauseOps, NumTasksClauseOps, PriorityClauseOps,
PrivateClauseOps, ReductionClauseOps, UntiedClauseOps>;
-using TaskwaitClauseOps = detail::Clauses<DependClauseOps, NowaitClauseOps>;
+using TaskwaitOperands = detail::Clauses<DependClauseOps, NowaitClauseOps>;
-using TeamsClauseOps =
+using TeamsOperands =
detail::Clauses<AllocateClauseOps, IfClauseOps, NumTeamsClauseOps,
PrivateClauseOps, ReductionClauseOps, ThreadLimitClauseOps>;
-using WsloopClauseOps =
+using WsloopOperands =
detail::Clauses<AllocateClauseOps, LinearClauseOps, NowaitClauseOps,
OrderClauseOps, OrderedClauseOps, PrivateClauseOps,
ReductionClauseOps, ScheduleClauseOps>;
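
As a hedged sketch of how a frontend consumes these renamed structures, the snippet below fills a `ParallelOperands` instance and passes it to the `ParallelOp` builder added in OpenMPOps.td further down. The value and symbol arguments are placeholders assumed to be produced elsewhere; only the struct, its fields, and the builder signature come from this patch.

```c++
// Sketch only: numThreadsValue, privateValue and privatizerSym are assumed
// to be created by the caller.
#include "mlir/Dialect/OpenMP/OpenMPClauseOperands.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"

using namespace mlir;

omp::ParallelOp buildParallel(OpBuilder &builder, Location loc,
                              Value numThreadsValue, Value privateValue,
                              SymbolRefAttr privatizerSym) {
  omp::ParallelOperands clauses;
  clauses.numThreads = numThreadsValue;         // was numThreadsVar
  clauses.privateVars.push_back(privateValue);
  clauses.privateSyms.push_back(privatizerSym); // was privatizers
  // Remaining members keep their default (empty) values.
  return builder.create<omp::ParallelOp>(loc, clauses);
}
```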
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
index 5b20168..e703c32 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauses.td
@@ -20,6 +20,7 @@
#define OPENMP_CLAUSES
include "mlir/Dialect/OpenMP/OpenMPOpBase.td"
+include "mlir/IR/SymbolInterfaces.td"
//===----------------------------------------------------------------------===//
// V5.2: [5.11] `aligned` clause
@@ -32,18 +33,18 @@ class OpenMP_AlignedClauseSkip<
description, extraClassDeclaration> {
let arguments = (ins
Variadic<OpenMP_PointerLikeType>:$aligned_vars,
- OptionalAttr<I64ArrayAttr>:$alignment_values
+ OptionalAttr<I64ArrayAttr>:$alignments
);
let assemblyFormat = [{
`aligned` `(` custom<AlignedClause>($aligned_vars, type($aligned_vars),
- $alignment_values) `)`
+ $alignments) `)`
}];
let description = [{
- The `alignment_values` attribute additionally specifies alignment of each
- corresponding aligned operand. Note that `aligned_vars` and
- `alignment_values` should contain the same number of elements.
+ The `alignments` attribute additionally specifies alignment of each
+ corresponding aligned operand. Note that `aligned_vars` and `alignments`
+ must contain the same number of elements.
}];
}
@@ -60,22 +61,22 @@ class OpenMP_AllocateClauseSkip<
description, extraClassDeclaration> {
let arguments = (ins
Variadic<AnyType>:$allocate_vars,
- Variadic<AnyType>:$allocators_vars
+ Variadic<AnyType>:$allocator_vars
);
let extraClassDeclaration = [{
unsigned getNumAllocateVars() { return getAllocateVars().size(); }
- unsigned getNumAllocatorsVars() { return getAllocatorsVars().size(); }
+ unsigned getNumAllocatorsVars() { return getAllocatorVars().size(); }
}];
let assemblyFormat = [{
`allocate` `(`
custom<AllocateAndAllocator>($allocate_vars, type($allocate_vars),
- $allocators_vars, type($allocators_vars)) `)`
+ $allocator_vars, type($allocator_vars)) `)`
}];
let description = [{
- The `allocators_vars` and `allocate_vars` parameters are a variadic list of
+ The `allocator_vars` and `allocate_vars` parameters are a variadic list of
values that specify the memory allocator to be used to obtain storage for
private values.
}];
@@ -93,12 +94,12 @@ class OpenMP_CancelDirectiveNameClauseSkip<
> : OpenMP_Clause</*isRequired=*/true, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- CancellationConstructTypeAttr:$cancellation_construct_type_val
+ CancellationConstructTypeAttr:$cancel_directive
);
let assemblyFormat = [{
`cancellation_construct_type` `(`
- custom<ClauseAttr>($cancellation_construct_type_val) `)`
+ custom<ClauseAttr>($cancel_directive) `)`
}];
// TODO: Add description.
@@ -107,36 +108,6 @@ class OpenMP_CancelDirectiveNameClauseSkip<
def OpenMP_CancelDirectiveNameClause : OpenMP_CancelDirectiveNameClauseSkip<>;
//===----------------------------------------------------------------------===//
-// V5.2: [4.4.3] `collapse` clause
-//===----------------------------------------------------------------------===//
-
-class OpenMP_CollapseClauseSkip<
- bit traits = false, bit arguments = false, bit assemblyFormat = false,
- bit description = false, bit extraClassDeclaration = false
- > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
- description, extraClassDeclaration> {
- let traits = [
- AllTypesMatch<["lowerBound", "upperBound", "step"]>
- ];
-
- let arguments = (ins
- Variadic<IntLikeType>:$lowerBound,
- Variadic<IntLikeType>:$upperBound,
- Variadic<IntLikeType>:$step
- );
-
- let extraClassDeclaration = [{
- /// Returns the number of loops in the loop nest.
- unsigned getNumLoops() { return getLowerBound().size(); }
- }];
-
- // Description and formatting integrated in the `omp.loop_nest` operation,
- // which is the only one currently accepting this clause.
-}
-
-def OpenMP_CollapseClause : OpenMP_CollapseClauseSkip<>;
-
-//===----------------------------------------------------------------------===//
// V5.2: [5.7.2] `copyprivate` clause
//===----------------------------------------------------------------------===//
@@ -147,13 +118,13 @@ class OpenMP_CopyprivateClauseSkip<
description, extraClassDeclaration> {
let arguments = (ins
Variadic<OpenMP_PointerLikeType>:$copyprivate_vars,
- OptionalAttr<SymbolRefArrayAttr>:$copyprivate_funcs
+ OptionalAttr<SymbolRefArrayAttr>:$copyprivate_syms
);
let assemblyFormat = [{
`copyprivate` `(`
- custom<CopyPrivateVarList>($copyprivate_vars, type($copyprivate_vars),
- $copyprivate_funcs) `)`
+ custom<Copyprivate>($copyprivate_vars, type($copyprivate_vars),
+ $copyprivate_syms) `)`
}];
let description = [{
@@ -174,6 +145,10 @@ class OpenMP_CriticalNameClauseSkip<
bit description = false, bit extraClassDeclaration = false
> : OpenMP_Clause</*isRequired=*/true, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
+ let traits = [
+ Symbol
+ ];
+
let arguments = (ins
SymbolNameAttr:$sym_name
);
@@ -197,18 +172,19 @@ class OpenMP_DependClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- OptionalAttr<TaskDependArrayAttr>:$depends,
+ OptionalAttr<TaskDependArrayAttr>:$depend_kinds,
Variadic<OpenMP_PointerLikeType>:$depend_vars
);
let assemblyFormat = [{
`depend` `(`
- custom<DependVarList>($depend_vars, type($depend_vars), $depends) `)`
+ custom<DependVarList>($depend_vars, type($depend_vars), $depend_kinds) `)`
}];
let description = [{
- The `depends` and `depend_vars` arguments are variadic lists of values that
- specify the dependencies of this particular task in relation to other tasks.
+ The `depend_kinds` and `depend_vars` arguments are variadic lists of values
+ that specify the dependencies of this particular task in relation to other
+ tasks.
}];
}
@@ -250,19 +226,20 @@ class OpenMP_DistScheduleClauseSkip<
description, extraClassDeclaration> {
let arguments = (ins
UnitAttr:$dist_schedule_static,
- Optional<IntLikeType>:$chunk_size
+ Optional<IntLikeType>:$dist_schedule_chunk_size
);
let assemblyFormat = [{
`dist_schedule_static` $dist_schedule_static
- | `chunk_size` `(` $chunk_size `:` type($chunk_size) `)`
+ | `dist_schedule_chunk_size` `(` $dist_schedule_chunk_size `:`
+ type($dist_schedule_chunk_size) `)`
}];
let description = [{
The `dist_schedule_static` attribute specifies the schedule for this loop,
determining how the loop is distributed across the various teams. The
- optional `chunk_size` associated with this determines further controls this
- distribution.
+ optional `dist_schedule_chunk_size` associated with this further controls
+ this distribution.
}];
}
@@ -278,24 +255,25 @@ class OpenMP_DoacrossClauseSkip<
> : OpenMP_Clause</*isRequired=*/true, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- OptionalAttr<ClauseDependAttr>:$depend_type_val,
- ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$num_loops_val,
- Variadic<AnyType>:$depend_vec_vars
+ OptionalAttr<ClauseDependAttr>:$doacross_depend_type,
+ ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$doacross_num_loops,
+ Variadic<AnyType>:$doacross_depend_vars
);
let assemblyFormat = [{
- ( `depend_type` `` $depend_type_val^ )?
- ( `depend_vec` `(` $depend_vec_vars^ `:` type($depend_vec_vars) `)` )?
+ ( `depend_type` `` $doacross_depend_type^ )?
+ ( `depend_vec` `(` $doacross_depend_vars^ `:` type($doacross_depend_vars)
+ `)` )?
}];
let description = [{
- The `depend_type_val` attribute refers to either the DEPEND(SOURCE) clause
- or the DEPEND(SINK: vec) clause.
+ The `doacross_depend_type` attribute refers to either the DEPEND(SOURCE)
+ clause or the DEPEND(SINK: vec) clause.
- The `num_loops_val` attribute specifies the number of loops in the doacross
- nest.
+ The `doacross_num_loops` attribute specifies the number of loops in the
+ doacross nest.
- The `depend_vec_vars` is a variadic list of operands that specifies the
+ The `doacross_depend_vars` is a variadic list of operands that specifies the
index of the loop iterator in the doacross nest for the DEPEND(SOURCE)
clause or the index of the element of "vec" for the DEPEND(SINK: vec)
clause. It contains the operands in multiple "vec" when multiple
@@ -343,11 +321,11 @@ class OpenMP_FinalClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- Optional<I1>:$final_expr
+ Optional<I1>:$final
);
let assemblyFormat = [{
- `final` `(` $final_expr `)`
+ `final` `(` $final `)`
}];
let description = [{
@@ -371,11 +349,11 @@ class OpenMP_GrainsizeClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- Optional<IntLikeType>:$grain_size
+ Optional<IntLikeType>:$grainsize
);
let assemblyFormat = [{
- `grain_size` `(` $grain_size `:` type($grain_size) `)`
+ `grainsize` `(` $grainsize `:` type($grainsize) `)`
}];
let description = [{
@@ -398,17 +376,18 @@ class OpenMP_HasDeviceAddrClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- Variadic<OpenMP_PointerLikeType>:$has_device_addr
+ Variadic<OpenMP_PointerLikeType>:$has_device_addr_vars
);
let assemblyFormat = [{
- `has_device_addr` `(` $has_device_addr `:` type($has_device_addr) `)`
+ `has_device_addr` `(` $has_device_addr_vars `:` type($has_device_addr_vars)
+ `)`
}];
let description = [{
- The optional `has_device_addr` indicates that list items already have device
- addresses, so they may be directly accessed from the target device. This
- includes array sections.
+ The optional `has_device_addr_vars` indicates that list items already have
+ device addresses, so they may be directly accessed from the target device.
+ This includes array sections.
}];
}
@@ -424,11 +403,11 @@ class OpenMP_HintClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- DefaultValuedOptionalAttr<I64Attr, "0">:$hint_val
+ DefaultValuedOptionalAttr<I64Attr, "0">:$hint
);
let assemblyFormat = [{
- `hint` `(` custom<SynchronizationHint>($hint_val) `)`
+ `hint` `(` custom<SynchronizationHint>($hint) `)`
}];
let description = [{
@@ -477,14 +456,14 @@ class OpenMP_InReductionClauseSkip<
let arguments = (ins
Variadic<OpenMP_PointerLikeType>:$in_reduction_vars,
- OptionalAttr<DenseBoolArrayAttr>:$in_reduction_vars_byref,
- OptionalAttr<SymbolRefArrayAttr>:$in_reductions
+ OptionalAttr<DenseBoolArrayAttr>:$in_reduction_byref,
+ OptionalAttr<SymbolRefArrayAttr>:$in_reduction_syms
);
let assemblyFormat = [{
`in_reduction` `(`
custom<ReductionVarList>($in_reduction_vars, type($in_reduction_vars),
- $in_reduction_vars_byref, $in_reductions) `)`
+ $in_reduction_byref, $in_reduction_syms) `)`
}];
let extraClassDeclaration = [{
@@ -510,15 +489,15 @@ class OpenMP_IsDevicePtrClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- Variadic<OpenMP_PointerLikeType>:$is_device_ptr
+ Variadic<OpenMP_PointerLikeType>:$is_device_ptr_vars
);
let assemblyFormat = [{
- `is_device_ptr` `(` $is_device_ptr `:` type($is_device_ptr) `)`
+ `is_device_ptr` `(` $is_device_ptr_vars `:` type($is_device_ptr_vars) `)`
}];
let description = [{
- The optional `is_device_ptr` indicates list items are device pointers.
+ The optional `is_device_ptr_vars` indicates that list items are device pointers.
}];
}
@@ -555,6 +534,38 @@ class OpenMP_LinearClauseSkip<
def OpenMP_LinearClause : OpenMP_LinearClauseSkip<>;
//===----------------------------------------------------------------------===//
+// Not in the spec: Clause-like structure to hold loop related information.
+//===----------------------------------------------------------------------===//
+
+class OpenMP_LoopRelatedClauseSkip<
+ bit traits = false, bit arguments = false, bit assemblyFormat = false,
+ bit description = false, bit extraClassDeclaration = false
+ > : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
+ description, extraClassDeclaration> {
+ let traits = [
+ AllTypesMatch<
+ ["loop_lower_bounds", "loop_upper_bounds", "loop_steps"]>
+ ];
+
+ let arguments = (ins
+ Variadic<IntLikeType>:$loop_lower_bounds,
+ Variadic<IntLikeType>:$loop_upper_bounds,
+ Variadic<IntLikeType>:$loop_steps,
+ UnitAttr:$loop_inclusive
+ );
+
+ let extraClassDeclaration = [{
+ /// Returns the number of loops in the loop nest.
+ unsigned getNumLoops() { return getLoopLowerBounds().size(); }
+ }];
+
+ // Description and formatting integrated in the `omp.loop_nest` operation,
+ // which is the only one currently accepting this clause.
+}
+
+def OpenMP_LoopRelatedClause : OpenMP_LoopRelatedClauseSkip<>;
+
+//===----------------------------------------------------------------------===//
// V5.2: [5.8.3] `map` clause
//===----------------------------------------------------------------------===//
@@ -568,16 +579,16 @@ class OpenMP_MapClauseSkip<
];
let arguments = (ins
- Variadic<OpenMP_PointerLikeType>:$map_operands
+ Variadic<OpenMP_PointerLikeType>:$map_vars
);
let assemblyFormat = [{
- `map_entries` `(` custom<MapEntries>($map_operands, type($map_operands)) `)`
+ `map_entries` `(` custom<MapEntries>($map_vars, type($map_vars)) `)`
}];
let description = [{
- The optional `map_operands` maps data from the current task's data
- environment to the device data environment.
+ The optional `map_vars` maps data from the current task's data environment
+ to the device data environment.
}];
}
@@ -593,11 +604,11 @@ class OpenMP_MemoryOrderClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- OptionalAttr<MemoryOrderKindAttr>:$memory_order_val
+ OptionalAttr<MemoryOrderKindAttr>:$memory_order
);
let assemblyFormat = [{
- `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`
+ `memory_order` `(` custom<ClauseAttr>($memory_order) `)`
}];
let description = [{
@@ -779,16 +790,16 @@ class OpenMP_NumThreadsClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- Optional<IntLikeType>:$num_threads_var
+ Optional<IntLikeType>:$num_threads
);
let assemblyFormat = [{
- `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)`
+ `num_threads` `(` $num_threads `:` type($num_threads) `)`
}];
let description = [{
- The optional `num_threads_var` parameter specifies the number of threads
- which should be used to execute the parallel region.
+ The optional `num_threads` parameter specifies the number of threads which
+ should be used to execute the parallel region.
}];
}
@@ -804,12 +815,12 @@ class OpenMP_OrderClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- OptionalAttr<OrderKindAttr>:$order_val,
+ OptionalAttr<OrderKindAttr>:$order,
OptionalAttr<OrderModifierAttr>:$order_mod
);
let assemblyFormat = [{
- `order` `(` custom<OrderClause>($order_val, $order_mod) `)`
+ `order` `(` custom<OrderClause>($order, $order_mod) `)`
}];
let description = [{
@@ -831,15 +842,15 @@ class OpenMP_OrderedClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val
+ ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered
);
let assemblyFormat = [{
- `ordered` `(` $ordered_val `)`
+ `ordered` `(` $ordered `)`
}];
let description = [{
- The optional `ordered_val` attribute specifies how many loops are associated
+ The optional `ordered` attribute specifies how many loops are associated
with the worksharing-loop construct. The value of zero refers to the ordered
clause specified without parameter.
}];
@@ -857,17 +868,17 @@ class OpenMP_ParallelizationLevelClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- UnitAttr:$simd
+ UnitAttr:$par_level_simd
);
let assemblyFormat = [{
- `simd` $simd
+ `par_level_simd` $par_level_simd
}];
let description = [{
- The `simd` attribute corresponds to the simd clause specified. If it is not
- present, it behaves as if the threads clause is specified or no clause is
- specified.
+ The `par_level_simd` attribute corresponds to the simd clause specified. If
+ it is not present, it behaves as if the threads clause is specified or no
+ clause is specified.
}];
}
@@ -914,12 +925,12 @@ class OpenMP_PrivateClauseSkip<
description, extraClassDeclaration> {
let arguments = (ins
Variadic<AnyType>:$private_vars,
- OptionalAttr<SymbolRefArrayAttr>:$privatizers
+ OptionalAttr<SymbolRefArrayAttr>:$private_syms
);
let assemblyFormat = [{
`private` `(`
- custom<PrivateList>($private_vars, type($private_vars), $privatizers) `)`
+ custom<PrivateList>($private_vars, type($private_vars), $private_syms) `)`
}];
// TODO: Add description.
@@ -937,15 +948,15 @@ class OpenMP_ProcBindClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- OptionalAttr<ProcBindKindAttr>:$proc_bind_val
+ OptionalAttr<ProcBindKindAttr>:$proc_bind_kind
);
let assemblyFormat = [{
- `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+ `proc_bind` `(` custom<ClauseAttr>($proc_bind_kind) `)`
}];
let description = [{
- The optional `proc_bind_val` attribute controls the thread affinity for the
+ The optional `proc_bind_kind` attribute controls the thread affinity for the
execution of the parallel region.
}];
}
@@ -967,14 +978,14 @@ class OpenMP_ReductionClauseSkip<
let arguments = (ins
Variadic<OpenMP_PointerLikeType>:$reduction_vars,
- OptionalAttr<DenseBoolArrayAttr>:$reduction_vars_byref,
- OptionalAttr<SymbolRefArrayAttr>:$reductions
+ OptionalAttr<DenseBoolArrayAttr>:$reduction_byref,
+ OptionalAttr<SymbolRefArrayAttr>:$reduction_syms
);
let assemblyFormat = [{
`reduction` `(`
custom<ReductionVarList>($reduction_vars, type($reduction_vars),
- $reduction_vars_byref, $reductions) `)`
+ $reduction_byref, $reduction_syms) `)`
}];
let extraClassDeclaration = [{
@@ -986,10 +997,10 @@ class OpenMP_ReductionClauseSkip<
let description = [{
Reductions can be performed by specifying reduction accumulator variables in
`reduction_vars`, symbols referring to reduction declarations in the
- `reductions` attribute, and whether the reduction variable should be passed
- into the reduction region by value or by reference in
- `reduction_vars_byref`. Each reduction is identified by the accumulator it
- uses and accumulators must not be repeated in the same reduction. A private
+ `reduction_syms` attribute, and whether the reduction variable should be
+ passed into the reduction region by value or by reference in
+ `reduction_byref`. Each reduction is identified by the accumulator it uses
+ and accumulators must not be repeated in the same reduction. A private
variable corresponding to the accumulator is used in place of the
accumulator inside the body of the operation. The reduction declaration
specifies how to combine the values from each iteration, section, team,
@@ -1036,22 +1047,22 @@ class OpenMP_ScheduleClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- OptionalAttr<ScheduleKindAttr>:$schedule_val,
- Optional<AnyType>:$schedule_chunk_var,
- OptionalAttr<ScheduleModifierAttr>:$schedule_modifier,
- UnitAttr:$simd_modifier
+ OptionalAttr<ScheduleKindAttr>:$schedule_kind,
+ Optional<AnyType>:$schedule_chunk,
+ OptionalAttr<ScheduleModifierAttr>:$schedule_mod,
+ UnitAttr:$schedule_simd
);
let assemblyFormat = [{
`schedule` `(`
- custom<ScheduleClause>($schedule_val, $schedule_modifier, $simd_modifier,
- $schedule_chunk_var, type($schedule_chunk_var)) `)`
+ custom<ScheduleClause>($schedule_kind, $schedule_mod, $schedule_simd,
+ $schedule_chunk, type($schedule_chunk)) `)`
}];
let description = [{
- The optional `schedule_val` attribute specifies the loop schedule for this
+ The optional `schedule_kind` attribute specifies the loop schedule for this
loop, determining how the loop is distributed across the parallel threads.
- The optional `schedule_chunk_var` associated with this determines further
+ The optional `schedule_chunk` associated with this further
controls this distribution.
}];
}
@@ -1098,14 +1109,14 @@ class OpenMP_TaskReductionClauseSkip<
let arguments = (ins
Variadic<OpenMP_PointerLikeType>:$task_reduction_vars,
- OptionalAttr<DenseBoolArrayAttr>:$task_reduction_vars_byref,
- OptionalAttr<SymbolRefArrayAttr>:$task_reductions
+ OptionalAttr<DenseBoolArrayAttr>:$task_reduction_byref,
+ OptionalAttr<SymbolRefArrayAttr>:$task_reduction_syms
);
let assemblyFormat = [{
`task_reduction` `(`
custom<ReductionVarList>($task_reduction_vars, type($task_reduction_vars),
- $task_reduction_vars_byref, $task_reductions) `)`
+ $task_reduction_byref, $task_reduction_syms) `)`
}];
let description = [{
@@ -1115,9 +1126,9 @@ class OpenMP_TaskReductionClauseSkip<
participating in the reduction. After the end of the region, the original
list item contains the result of the reduction. Similarly to the `reduction`
clause, accumulator variables must be passed in `task_reduction_vars`,
- symbols referring to reduction declarations in the `task_reductions`
+ symbols referring to reduction declarations in the `task_reduction_syms`
attribute, and whether the reduction variable should be passed into the
- reduction region by value or by reference in `task_reduction_vars_byref`.
+ reduction region by value or by reference in `task_reduction_byref`.
}];
let extraClassDeclaration = [{
@@ -1176,7 +1187,7 @@ class OpenMP_UntiedClauseSkip<
If the `untied` clause is present on a task construct, any thread in the
team can resume the task region after a suspension. The `untied` clause is
ignored if a `final` clause is present on the same task construct and the
- `final_expr` evaluates to `true`, or if a task is an included task.
+ `final` expression evaluates to `true`, or if a task is an included task.
}];
}
@@ -1192,16 +1203,16 @@ class OpenMP_UseDeviceAddrClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- Variadic<OpenMP_PointerLikeType>:$use_device_addr
+ Variadic<OpenMP_PointerLikeType>:$use_device_addr_vars
);
let assemblyFormat = [{
- `use_device_addr` `(` $use_device_addr `:` type($use_device_addr) `)`
+ `use_device_addr` `(` $use_device_addr_vars `:` type($use_device_addr_vars) `)`
}];
let description = [{
- The optional `use_device_addr` specifies the address of the objects in the
- device data environment.
+ The optional `use_device_addr_vars` specifies the address of the objects in
+ the device data environment.
}];
}
@@ -1217,15 +1228,15 @@ class OpenMP_UseDevicePtrClauseSkip<
> : OpenMP_Clause</*isRequired=*/false, traits, arguments, assemblyFormat,
description, extraClassDeclaration> {
let arguments = (ins
- Variadic<OpenMP_PointerLikeType>:$use_device_ptr
+ Variadic<OpenMP_PointerLikeType>:$use_device_ptr_vars
);
let assemblyFormat = [{
- `use_device_ptr` `(` $use_device_ptr `:` type($use_device_ptr) `)`
+ `use_device_ptr` `(` $use_device_ptr_vars `:` type($use_device_ptr_vars) `)`
}];
let description = [{
- The optional `use_device_ptr` specifies the device pointers to the
+ The optional `use_device_ptr_vars` specifies the device pointers to the
corresponding list items in the device data environment.
}];
}
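
Because TableGen derives the generated C++ accessors from these argument names, the renames above ripple into user code. A hedged sketch of the consumer side, using the allocate clause whose accessors are spelled out in the `extraClassDeclaration` above; the concrete op (`omp::TaskOp`) and helper are placeholders.

```c++
// Sketch only: `taskOp` stands for any operation that pulls in the allocate
// clause; accessor names follow the renamed tablegen arguments.
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"

using namespace mlir;

static bool hasBalancedAllocateClause(omp::TaskOp taskOp) {
  // getAllocatorVars() was getAllocatorsVars(), matching the rename of the
  // $allocators_vars argument to $allocator_vars.
  return taskOp.getAllocateVars().size() == taskOp.getAllocatorVars().size();
}
```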
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 69fd1f1..68f92e6 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -132,28 +132,26 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [
DeclareOpInterfaceMethods<OutlineableOpenMPOpInterface>,
RecursiveMemoryEffects
], clauses = [
- // TODO: Sort clauses alphabetically.
+ OpenMP_AllocateClauseSkip<assemblyFormat = true>,
OpenMP_IfClauseSkip<assemblyFormat = true>,
OpenMP_NumThreadsClauseSkip<assemblyFormat = true>,
- OpenMP_AllocateClauseSkip<assemblyFormat = true>,
- OpenMP_ReductionClauseSkip<assemblyFormat = true>,
+ OpenMP_PrivateClauseSkip<assemblyFormat = true>,
OpenMP_ProcBindClauseSkip<assemblyFormat = true>,
- OpenMP_PrivateClauseSkip<assemblyFormat = true>
+ OpenMP_ReductionClauseSkip<assemblyFormat = true>
], singleRegion = true> {
let summary = "parallel construct";
let description = [{
The parallel construct includes a region of code which is to be executed
by a team of threads.
- The optional `if_expr` parameter specifies a boolean result of a
- conditional check. If this value is 1 or is not provided then the parallel
- region runs as normal, if it is 0 then the parallel region is executed with
- one thread.
+ The optional `if_expr` parameter specifies a boolean result of a conditional
+ check. If this value is 1 or is not provided, the parallel region runs as
+ normal; if it is 0, the parallel region is executed with one thread.
}] # clausesDescription;
let builders = [
OpBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>,
- OpBuilder<(ins CArg<"const ParallelClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const ParallelOperands &">:$clauses)>
];
// TODO: Use default assembly format inherited from OpenMP_Op once printing
@@ -163,16 +161,16 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [
let assemblyFormat = [{
oilist(
`if` `(` $if_expr `)`
- | `num_threads` `(` $num_threads_var `:` type($num_threads_var) `)`
+ | `num_threads` `(` $num_threads `:` type($num_threads) `)`
| `allocate` `(`
custom<AllocateAndAllocator>(
$allocate_vars, type($allocate_vars),
- $allocators_vars, type($allocators_vars)
+ $allocator_vars, type($allocator_vars)
) `)`
- | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+ | `proc_bind` `(` custom<ClauseAttr>($proc_bind_kind) `)`
) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars),
- $reduction_vars_byref, $reductions, $private_vars,
- type($private_vars), $privatizers) attr-dict
+ $reduction_byref, $reduction_syms, $private_vars,
+ type($private_vars), $private_syms) attr-dict
}];
let hasVerifier = 1;
@@ -196,10 +194,8 @@ def TerminatorOp : OpenMP_Op<"terminator", [Terminator, Pure]> {
def TeamsOp : OpenMP_Op<"teams", traits = [
AttrSizedOperandSegments, RecursiveMemoryEffects
], clauses = [
- // TODO: Complete clause list (private).
- // TODO: Sort clauses alphabetically.
- OpenMP_NumTeamsClause, OpenMP_IfClause, OpenMP_ThreadLimitClause,
- OpenMP_AllocateClause, OpenMP_ReductionClause
+ OpenMP_AllocateClause, OpenMP_IfClause, OpenMP_NumTeamsClause,
+ OpenMP_PrivateClause, OpenMP_ReductionClause, OpenMP_ThreadLimitClause
], singleRegion = true> {
let summary = "teams construct";
let description = [{
@@ -212,7 +208,7 @@ def TeamsOp : OpenMP_Op<"teams", traits = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TeamsClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TeamsOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -239,9 +235,8 @@ def SectionOp : OpenMP_Op<"section", [HasParent<"SectionsOp">],
def SectionsOp : OpenMP_Op<"sections", traits = [
AttrSizedOperandSegments
], clauses = [
- // TODO: Complete clause list (private).
- // TODO: Sort clauses alphabetically.
- OpenMP_ReductionClause, OpenMP_AllocateClause, OpenMP_NowaitClause
+ OpenMP_AllocateClause, OpenMP_NowaitClause, OpenMP_PrivateClause,
+ OpenMP_ReductionClause
], singleRegion = true> {
let summary = "sections construct";
let description = [{
@@ -258,7 +253,7 @@ def SectionsOp : OpenMP_Op<"sections", traits = [
let regions = (region SizedRegion<1>:$region);
let builders = [
- OpBuilder<(ins CArg<"const SectionsClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const SectionsOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -272,8 +267,8 @@ def SectionsOp : OpenMP_Op<"sections", traits = [
def SingleOp : OpenMP_Op<"single", traits = [
AttrSizedOperandSegments
], clauses = [
- // TODO: Complete clause list (private).
- OpenMP_AllocateClause, OpenMP_CopyprivateClause, OpenMP_NowaitClause
+ OpenMP_AllocateClause, OpenMP_CopyprivateClause, OpenMP_NowaitClause,
+ OpenMP_PrivateClause
], singleRegion = true> {
let summary = "single directive";
let description = [{
@@ -285,7 +280,7 @@ def SingleOp : OpenMP_Op<"single", traits = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const SingleClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const SingleOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -298,7 +293,7 @@ def SingleOp : OpenMP_Op<"single", traits = [
def LoopNestOp : OpenMP_Op<"loop_nest", traits = [
RecursiveMemoryEffects, SameVariadicOperandSize
], clauses = [
- OpenMP_CollapseClause
+ OpenMP_LoopRelatedClause
], singleRegion = true> {
let summary = "rectangular loop nest";
let description = [{
@@ -307,14 +302,14 @@ def LoopNestOp : OpenMP_Op<"loop_nest", traits = [
lower and upper bounds, as well as a step variable, must be defined.
The lower and upper bounds specify a half-open range: the range includes the
- lower bound but does not include the upper bound. If the `inclusive`
+ lower bound but does not include the upper bound. If the `loop_inclusive`
attribute is specified then the upper bound is also included.
The body region can contain any number of blocks. The region is terminated
by an `omp.yield` instruction without operands. The induction variables,
represented as entry block arguments to the loop nest operation's single
- region, match the types of the `lowerBound`, `upperBound` and `step`
- arguments.
+ region, match the types of the `loop_lower_bounds`, `loop_upper_bounds` and
+ `loop_steps` arguments.
```mlir
omp.loop_nest (%i1, %i2) : i32 = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) {
@@ -336,10 +331,8 @@ def LoopNestOp : OpenMP_Op<"loop_nest", traits = [
non-perfectly nested loops.
}];
- let arguments = !con(clausesArgs, (ins UnitAttr:$inclusive));
-
let builders = [
- OpBuilder<(ins CArg<"const LoopNestClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const LoopNestOperands &">:$clauses)>
];
let extraClassDeclaration = [{
@@ -366,14 +359,14 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [
AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopWrapperInterface>,
RecursiveMemoryEffects, SingleBlock
], clauses = [
- // TODO: Complete clause list (allocate, private).
- // TODO: Sort clauses alphabetically.
+ OpenMP_AllocateClauseSkip<assemblyFormat = true>,
OpenMP_LinearClauseSkip<assemblyFormat = true>,
- OpenMP_ReductionClauseSkip<assemblyFormat = true>,
- OpenMP_ScheduleClauseSkip<assemblyFormat = true>,
OpenMP_NowaitClauseSkip<assemblyFormat = true>,
+ OpenMP_OrderClauseSkip<assemblyFormat = true>,
OpenMP_OrderedClauseSkip<assemblyFormat = true>,
- OpenMP_OrderClauseSkip<assemblyFormat = true>
+ OpenMP_PrivateClauseSkip<assemblyFormat = true>,
+ OpenMP_ReductionClauseSkip<assemblyFormat = true>,
+ OpenMP_ScheduleClauseSkip<assemblyFormat = true>
], singleRegion = true> {
let summary = "worksharing-loop construct";
let description = [{
@@ -402,7 +395,7 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [
let builders = [
OpBuilder<(ins CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>,
- OpBuilder<(ins CArg<"const WsloopClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const WsloopOperands &">:$clauses)>
];
// TODO: Use default assembly format inherited from OpenMP_Op once printing
@@ -415,13 +408,20 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [
$linear_step_vars) `)`
|`schedule` `(`
custom<ScheduleClause>(
- $schedule_val, $schedule_modifier, $simd_modifier,
- $schedule_chunk_var, type($schedule_chunk_var)) `)`
+ $schedule_kind, $schedule_mod, $schedule_simd,
+ $schedule_chunk, type($schedule_chunk)) `)`
|`nowait` $nowait
- |`ordered` `(` $ordered_val `)`
- |`order` `(` custom<OrderClause>($order_val, $order_mod) `)`
+ |`ordered` `(` $ordered `)`
+ |`order` `(` custom<OrderClause>($order, $order_mod) `)`
+ |`allocate` `(`
+ custom<AllocateAndAllocator>(
+ $allocate_vars, type($allocate_vars), $allocator_vars,
+ type($allocator_vars)) `)`
+ |`private` `(`
+ custom<PrivateList>(
+ $private_vars, type($private_vars), $private_syms) `)`
) custom<Wsloop>($region, $reduction_vars, type($reduction_vars),
- $reduction_vars_byref, $reductions) attr-dict
+ $reduction_byref, $reduction_syms) attr-dict
}];
let hasVerifier = 1;
@@ -435,9 +435,9 @@ def SimdOp : OpenMP_Op<"simd", traits = [
AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopWrapperInterface>,
RecursiveMemoryEffects, SingleBlock
], clauses = [
- // TODO: Complete clause list (linear, private, reduction).
- OpenMP_AlignedClause, OpenMP_IfClause, OpenMP_NontemporalClause,
- OpenMP_OrderClause, OpenMP_SafelenClause, OpenMP_SimdlenClause
+ OpenMP_AlignedClause, OpenMP_IfClause, OpenMP_LinearClause,
+ OpenMP_NontemporalClause, OpenMP_OrderClause, OpenMP_PrivateClause,
+ OpenMP_ReductionClause, OpenMP_SafelenClause, OpenMP_SimdlenClause
], singleRegion = true> {
let summary = "simd construct";
let description = [{
@@ -468,7 +468,7 @@ def SimdOp : OpenMP_Op<"simd", traits = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const SimdClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const SimdOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -502,9 +502,8 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [
AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopWrapperInterface>,
RecursiveMemoryEffects, SingleBlock
], clauses = [
- // TODO: Complete clause list (private).
- // TODO: Sort clauses alphabetically.
- OpenMP_DistScheduleClause, OpenMP_AllocateClause, OpenMP_OrderClause
+ OpenMP_AllocateClause, OpenMP_DistScheduleClause, OpenMP_OrderClause,
+ OpenMP_PrivateClause
], singleRegion = true> {
let summary = "distribute construct";
let description = [{
@@ -541,7 +540,7 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const DistributeClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const DistributeOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -555,11 +554,10 @@ def TaskOp : OpenMP_Op<"task", traits = [
AttrSizedOperandSegments, AutomaticAllocationScope,
OutlineableOpenMPOpInterface
], clauses = [
- // TODO: Complete clause list (affinity, detach, private).
- // TODO: Sort clauses alphabetically.
- OpenMP_IfClause, OpenMP_FinalClause, OpenMP_UntiedClause,
- OpenMP_MergeableClause, OpenMP_InReductionClause,
- OpenMP_PriorityClause, OpenMP_DependClause, OpenMP_AllocateClause
+ // TODO: Complete clause list (affinity, detach).
+ OpenMP_AllocateClause, OpenMP_DependClause, OpenMP_FinalClause,
+ OpenMP_IfClause, OpenMP_InReductionClause, OpenMP_MergeableClause,
+ OpenMP_PriorityClause, OpenMP_PrivateClause, OpenMP_UntiedClause
], singleRegion = true> {
let summary = "task construct";
let description = [{
@@ -576,12 +574,12 @@ def TaskOp : OpenMP_Op<"task", traits = [
The `in_reduction` clause specifies that this particular task (among all the
tasks in current taskgroup, if any) participates in a reduction.
- `in_reduction_vars_byref` indicates whether each reduction variable should
+ `in_reduction_byref` indicates whether each reduction variable should
be passed by value or by reference.
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TaskClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TaskOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -592,14 +590,12 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
DeclareOpInterfaceMethods<LoopWrapperInterface>, RecursiveMemoryEffects,
SingleBlock
], clauses = [
- // TODO: Complete clause list (private).
- // TODO: Sort clauses alphabetically.
- OpenMP_IfClause, OpenMP_FinalClause, OpenMP_UntiedClause,
- OpenMP_MergeableClause,
- OpenMP_InReductionClauseSkip<extraClassDeclaration = true>,
+ OpenMP_AllocateClause, OpenMP_FinalClause, OpenMP_GrainsizeClause,
+ OpenMP_IfClause, OpenMP_InReductionClauseSkip<extraClassDeclaration = true>,
+ OpenMP_MergeableClause, OpenMP_NogroupClause, OpenMP_NumTasksClause,
+ OpenMP_PriorityClause, OpenMP_PrivateClause,
OpenMP_ReductionClauseSkip<extraClassDeclaration = true>,
- OpenMP_PriorityClause, OpenMP_AllocateClause, OpenMP_GrainsizeClause,
- OpenMP_NumTasksClause, OpenMP_NogroupClause
+ OpenMP_UntiedClause
], singleRegion = true> {
let summary = "taskloop construct";
let description = [{
@@ -639,7 +635,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
items is present. Thus, the generated tasks are participants of a reduction
previously defined by a reduction scoping clause. In this case, accumulator
variables are specified in `in_reduction_vars`, symbols referring to
- reduction declarations in `in_reductions` and `in_reduction_vars_byref`
+ reduction declarations in `in_reduction_syms` and `in_reduction_byref`
indicate for each reduction variable whether it should be passed by value or
by reference.
@@ -654,7 +650,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
}];
let builders = [
- OpBuilder<(ins CArg<"const TaskloopClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TaskloopOperands &">:$clauses)>
];
let extraClassDeclaration = [{
@@ -670,8 +666,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [
def TaskgroupOp : OpenMP_Op<"taskgroup", traits = [
AttrSizedOperandSegments, AutomaticAllocationScope
], clauses = [
- // TODO: Sort clauses alphabetically.
- OpenMP_TaskReductionClause, OpenMP_AllocateClause
+ OpenMP_AllocateClause, OpenMP_TaskReductionClause
], singleRegion = true> {
let summary = "taskgroup construct";
let description = [{
@@ -688,7 +683,7 @@ def TaskgroupOp : OpenMP_Op<"taskgroup", traits = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TaskgroupClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TaskgroupOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -951,9 +946,8 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
def TargetDataOp: OpenMP_Op<"target_data", traits = [
AttrSizedOperandSegments
], clauses = [
- // TODO: Sort clauses alphabetically.
- OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_UseDevicePtrClause,
- OpenMP_UseDeviceAddrClause, OpenMP_MapClause
+ OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+ OpenMP_UseDeviceAddrClause, OpenMP_UseDevicePtrClause
], singleRegion = true> {
let summary = "target data construct";
let description = [{
@@ -965,14 +959,13 @@ def TargetDataOp: OpenMP_Op<"target_data", traits = [
to and from the offloading device when multiple target regions are using
the same data.
- The optional `if_expr` parameter specifies a boolean result of a
- conditional check. If this value is 1 or is not provided then the target
- region runs on a device, if it is 0 then the target region is executed
- on the host device.
+ The optional `if_expr` parameter specifies a boolean result of a conditional
+ check. If this value is 1 or is not provided, the target region runs on a
+ device; if it is 0, the target region is executed on the host device.
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TargetDataClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TargetDataOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -985,9 +978,8 @@ def TargetDataOp: OpenMP_Op<"target_data", traits = [
def TargetEnterDataOp: OpenMP_Op<"target_enter_data", traits = [
AttrSizedOperandSegments
], clauses = [
- // TODO: Sort clauses alphabetically.
- OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_DependClause,
- OpenMP_NowaitClause, OpenMP_MapClause
+ OpenMP_DependClause, OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+ OpenMP_NowaitClause
]> {
let summary = "target enter data construct";
let description = [{
@@ -995,14 +987,13 @@ def TargetEnterDataOp: OpenMP_Op<"target_enter_data", traits = [
a device data environment. The target enter data directive is a
stand-alone directive.
- The optional `if_expr` parameter specifies a boolean result of a
- conditional check. If this value is 1 or is not provided then the target
- region runs on a device, if it is 0 then the target region is executed on
- the host device.
+ The optional `if_expr` parameter specifies a boolean result of a conditional
+ check. If this value is 1 or is not provided, the target region runs on a
+ device; if it is 0, the target region is executed on the host device.
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1015,9 +1006,8 @@ def TargetEnterDataOp: OpenMP_Op<"target_enter_data", traits = [
def TargetExitDataOp: OpenMP_Op<"target_exit_data", traits = [
AttrSizedOperandSegments
], clauses = [
- // TODO: Sort clauses alphabetically.
- OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_DependClause,
- OpenMP_NowaitClause, OpenMP_MapClause
+ OpenMP_DependClause, OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+ OpenMP_NowaitClause
]> {
let summary = "target exit data construct";
let description = [{
@@ -1025,14 +1015,13 @@ def TargetExitDataOp: OpenMP_Op<"target_exit_data", traits = [
device data environment. The target exit data directive is
a stand-alone directive.
- The optional `if_expr` parameter specifies a boolean result of a
- conditional check. If this value is 1 or is not provided then the target
- region runs on a device, if it is 0 then the target region is executed
- on the host device.
+ The optional `if_expr` parameter specifies a boolean result of a conditional
+ check. If this value is 1 or is not provided, the target region runs on a
+ device; if it is 0, the target region is executed on the host device.
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1045,9 +1034,8 @@ def TargetExitDataOp: OpenMP_Op<"target_exit_data", traits = [
def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
AttrSizedOperandSegments
], clauses = [
- // TODO: Sort clauses alphabetically.
- OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_DependClause,
- OpenMP_NowaitClause, OpenMP_MapClause
+ OpenMP_DependClause, OpenMP_DeviceClause, OpenMP_IfClause, OpenMP_MapClause,
+ OpenMP_NowaitClause
]> {
let summary = "target update construct";
let description = [{
@@ -1056,10 +1044,9 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
specified motion clauses. The target update construct is a stand-alone
directive.
- The optional `if_expr` parameter specifies a boolean result of a
- conditional check. If this value is 1 or is not provided then the target
- region runs on a device, if it is 0 then the target region is executed
- on the host device.
+ The optional `if_expr` parameter specifies a boolean result of a conditional
+ check. If this value is 1 or is not provided, the target region runs on a
+ device; if it is 0, the target region is executed on the host device.
We use `MapInfoOp` to model the motion clauses and their modifiers. Even
though the spec differentiates between map-types & map-type-modifiers vs.
@@ -1070,7 +1057,7 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TargetEnterExitUpdateDataOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1083,26 +1070,24 @@ def TargetUpdateOp: OpenMP_Op<"target_update", traits = [
def TargetOp : OpenMP_Op<"target", traits = [
AttrSizedOperandSegments, IsolatedFromAbove, OutlineableOpenMPOpInterface
], clauses = [
- // TODO: Complete clause list (allocate, defaultmap, in_reduction,
- // uses_allocators).
- // TODO: Sort clauses alphabetically.
- OpenMP_IfClause, OpenMP_DeviceClause, OpenMP_ThreadLimitClause,
- OpenMP_DependClause, OpenMP_NowaitClause, OpenMP_IsDevicePtrClause,
- OpenMP_HasDeviceAddrClause, OpenMP_MapClause, OpenMP_PrivateClause
+ // TODO: Complete clause list (defaultmap, uses_allocators).
+ OpenMP_AllocateClause, OpenMP_DependClause, OpenMP_DeviceClause,
+ OpenMP_HasDeviceAddrClause, OpenMP_IfClause, OpenMP_InReductionClause,
+ OpenMP_IsDevicePtrClause, OpenMP_MapClause, OpenMP_NowaitClause,
+ OpenMP_PrivateClause, OpenMP_ThreadLimitClause
], singleRegion = true> {
let summary = "target construct";
let description = [{
The target construct includes a region of code which is to be executed
on a device.
- The optional `if_expr` parameter specifies a boolean result of a
- conditional check. If this value is 1 or is not provided then the target
- region runs on a device, if it is 0 then the target region is executed on the
- host device.
+ The optional `if_expr` parameter specifies a boolean result of a conditional
+ check. If this value is 1 or is not provided, the target region runs on a
+ device; if it is 0, the target region is executed on the host device.
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TargetClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TargetOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1125,9 +1110,7 @@ def MasterOp : OpenMP_Op<"master", singleRegion = true> {
//===----------------------------------------------------------------------===//
// 2.17.1 critical Construct
//===----------------------------------------------------------------------===//
-def CriticalDeclareOp : OpenMP_Op<"critical.declare", traits = [
- Symbol
- ], clauses = [
+def CriticalDeclareOp : OpenMP_Op<"critical.declare", clauses = [
OpenMP_CriticalNameClause, OpenMP_HintClause
]> {
let summary = "declares a named critical section.";
@@ -1136,7 +1119,7 @@ def CriticalDeclareOp : OpenMP_Op<"critical.declare", traits = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const CriticalClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const CriticalDeclareOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1189,7 +1172,7 @@ def OrderedOp : OpenMP_Op<"ordered", clauses = [OpenMP_DoacrossClause]> {
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const OrderedOpClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const OrderedOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1206,7 +1189,7 @@ def OrderedRegionOp : OpenMP_Op<"ordered.region", clauses = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const OrderedRegionClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const OrderedRegionOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1217,7 +1200,7 @@ def OrderedRegionOp : OpenMP_Op<"ordered.region", clauses = [
//===----------------------------------------------------------------------===//
def TaskwaitOp : OpenMP_Op<"taskwait", clauses = [
- // TODO: Complete clause list (depend, nowait).
+ OpenMP_DependClause, OpenMP_NowaitClause
]> {
let summary = "taskwait construct";
let description = [{
@@ -1226,11 +1209,8 @@ def TaskwaitOp : OpenMP_Op<"taskwait", clauses = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const TaskwaitClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const TaskwaitOperands &">:$clauses)>
];
-
- // TODO: Remove overriden `assemblyFormat` once a clause is added.
- let assemblyFormat = "attr-dict";
}
//===----------------------------------------------------------------------===//
@@ -1264,8 +1244,8 @@ def AtomicReadOp : OpenMP_Op<"atomic.read", traits = [
// Override clause-based assemblyFormat.
let assemblyFormat = [{
$v `=` $x
- oilist( `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`
- | `hint` `(` custom<SynchronizationHint>($hint_val) `)`)
+ oilist( `memory_order` `(` custom<ClauseAttr>($memory_order) `)`
+ | `hint` `(` custom<SynchronizationHint>($hint) `)`)
`:` type($x) `,` $element_type attr-dict
}];
@@ -1308,8 +1288,8 @@ def AtomicWriteOp : OpenMP_Op<"atomic.write", traits = [
// Override clause-based assemblyFormat.
let assemblyFormat = [{
$x `=` $expr
- oilist( `hint` `(` custom<SynchronizationHint>($hint_val) `)`
- | `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`)
+ oilist( `hint` `(` custom<SynchronizationHint>($hint) `)`
+ | `memory_order` `(` custom<ClauseAttr>($memory_order) `)`)
`:` type($x) `,` type($expr)
attr-dict
}];
@@ -1371,8 +1351,8 @@ def AtomicUpdateOp : OpenMP_Op<"atomic.update", traits = [
// Override clause-based assemblyFormat.
let assemblyFormat = [{
- oilist( `memory_order` `(` custom<ClauseAttr>($memory_order_val) `)`
- | `hint` `(` custom<SynchronizationHint>($hint_val) `)`)
+ oilist( `memory_order` `(` custom<ClauseAttr>($memory_order) `)`
+ | `hint` `(` custom<SynchronizationHint>($hint) `)`)
$x `:` type($x) $region attr-dict
}];
@@ -1505,7 +1485,7 @@ def CancelOp : OpenMP_Op<"cancel", clauses = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const CancelClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const CancelOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1525,7 +1505,7 @@ def CancellationPointOp : OpenMP_Op<"cancellation_point", clauses = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const CancellationPointClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const CancellationPointOperands &">:$clauses)>
];
let hasVerifier = 1;
@@ -1605,7 +1585,7 @@ def MaskedOp : OpenMP_Op<"masked", clauses = [
}] # clausesDescription;
let builders = [
- OpBuilder<(ins CArg<"const MaskedClauseOps &">:$clauses)>
+ OpBuilder<(ins CArg<"const MaskedOperands &">:$clauses)>
];
}
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index 385aa8b1..45d30a4 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -41,14 +41,14 @@ def MapClauseOwningOpInterface : OpInterface<"MapClauseOwningOpInterface"> {
let cppNamespace = "::mlir::omp";
let methods = [
- InterfaceMethod<"Get map operands", "::mlir::OperandRange", "getMapOperands",
+ InterfaceMethod<"Get map operands", "::mlir::OperandRange", "getMapVars",
(ins), [{
- return $_op.getMapOperands();
+ return $_op.getMapVars();
}]>,
InterfaceMethod<"Get mutable map operands", "::mlir::MutableOperandRange",
- "getMapOperandsMutable",
+ "getMapVarsMutable",
(ins), [{
- return $_op.getMapOperandsMutable();
+ return $_op.getMapVarsMutable();
}]>,
];
}
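
A hypothetical caller sketch (not part of this patch) showing the renamed interface accessors in use; `op` is an assumed `mlir::Operation *` that implements the interface above.

```cpp
// Hypothetical caller, assuming `op` implements MapClauseOwningOpInterface.
if (auto mapIface = llvm::dyn_cast<mlir::omp::MapClauseOwningOpInterface>(op)) {
  mlir::OperandRange mapVars = mapIface.getMapVars();            // renamed getter
  mlir::MutableOperandRange mapVarsMut = mapIface.getMapVarsMutable();
  (void)mapVars;
  (void)mapVarsMut;
}
```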
diff --git a/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt
index df07b8d..8293896 100644
--- a/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Ptr/IR/CMakeLists.txt
@@ -1,5 +1,5 @@
add_mlir_dialect(PtrOps ptr)
-add_mlir_doc(PtrOps PtrOps Dialects/ -gen-op-doc)
+add_mlir_doc(PtrOps PtrOps Dialects/ -gen-dialect-doc -dialect=ptr)
set(LLVM_TARGET_DEFINITIONS PtrOps.td)
mlir_tablegen(PtrOpsAttrs.h.inc -gen-attrdef-decls -attrdefs-dialect=ptr)
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
index fdf2570..5e66774 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Patterns.h
@@ -85,6 +85,9 @@ void populateSCFForLoopCanonicalizationPatterns(RewritePatternSet &patterns);
/// * `after` block containing arith.addi
void populateUpliftWhileToForPatterns(RewritePatternSet &patterns);
+/// Populate patterns to rotate `scf.while` ops, constructing `do-while` loops
+/// from `while` loops.
+void populateSCFRotateWhileLoopPatterns(RewritePatternSet &patterns);
} // namespace scf
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index d68ca11..1f21af6d 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -32,9 +32,11 @@ using SCFTileSizeComputationFunction =
/// Options to use to control tiling.
struct SCFTilingOptions {
- /// Computation function that returns the tile sizes for each operation.
- /// Delayed construction of constant tile sizes should occur to interoperate
- /// with folding.
+ /// Computation function that returns the tile sizes to use for each loop.
+ /// Returning a tile size of zero implies no tiling for that loop. If the
+ /// size of the returned vector is smaller than the number of loops, the inner
+ /// loops are not tiled. If the size of the returned vector is larger, then
+  /// the vector is truncated to the number of loops.
SCFTileSizeComputationFunction tileSizeComputationFunction = nullptr;
SCFTilingOptions &
@@ -45,7 +47,27 @@ struct SCFTilingOptions {
/// Convenience function to set the `tileSizeComputationFunction` to a
/// function that computes tile sizes at the point they are needed. Allows
/// proper interaction with folding.
- SCFTilingOptions &setTileSizes(ArrayRef<OpFoldResult> ts);
+ SCFTilingOptions &setTileSizes(ArrayRef<OpFoldResult> tileSizes);
+
+ /// Computation function that returns the number of threads to use for
+  /// each loop. Returning a thread count of zero implies no tiling for that
+  /// loop. If the size of the returned vector is smaller than the number of
+  /// loops, the inner loops are not tiled. If the size of the returned vector
+  /// is larger, then the vector is truncated to the number of loops. Note: this
+  /// option is only supported with loopType set to `LoopType::ForallOp`. If the
+  /// tile size function is not specified but the number-of-threads function is,
+  /// then the tile size is determined automatically to map at most one tile
+  /// per thread.
+ SCFTileSizeComputationFunction numThreadsComputationFunction = nullptr;
+
+ SCFTilingOptions &
+ setNumThreadsComputationFunction(SCFTileSizeComputationFunction fun) {
+ numThreadsComputationFunction = std::move(fun);
+ return *this;
+ }
+ /// Convenience function to set the `numThreadsComputationFunction` to a
+ /// function that computes num threads at the point they are needed.
+ SCFTilingOptions &setNumThreads(ArrayRef<OpFoldResult> numThreads);
/// The interchange vector to reorder the tiled loops.
SmallVector<int64_t> interchangeVector = {};
@@ -67,9 +89,8 @@ struct SCFTilingOptions {
/// when using loop constructs that dont support such a mapping (like
/// `scf.for`)
SmallVector<Attribute> mappingVector = {};
- SCFTilingOptions &setMapping(ArrayRef<DeviceMappingAttrInterface> mapping) {
- mappingVector = llvm::map_to_vector(
- mapping, [](auto attr) -> Attribute { return attr; });
+ SCFTilingOptions &setMapping(ArrayRef<Attribute> mapping) {
+ mappingVector = llvm::to_vector(mapping);
return *this;
}
};
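
A minimal usage sketch (not part of this patch) of the options declared above. It assumes `b` is an `OpBuilder` in scope and that `SCFTileSizeComputationFunction` is callable with `(OpBuilder &, Operation *)`.

```cpp
// Tile loop 0 by 8 and leave loop 1 untiled; sizes may be given eagerly ...
SmallVector<OpFoldResult> tileSizes = {b.getIndexAttr(8), b.getIndexAttr(0)};
scf::SCFTilingOptions options;
options.setTileSizes(tileSizes);

// ... or the thread counts may be computed lazily, at the point they are
// needed, using the new num-threads hook (only meaningful with ForallOp loops).
options.setNumThreadsComputationFunction(
    [](OpBuilder &builder, Operation *op) -> SmallVector<OpFoldResult> {
      return {builder.getIndexAttr(4), builder.getIndexAttr(0)};
    });
```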
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
index 71835cd..ea2f457 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/Transforms.h
@@ -228,6 +228,11 @@ FailureOr<ForOp> pipelineForLoop(RewriterBase &rewriter, ForOp forOp,
/// } else {
/// scf.yield %pre_val : i64
/// }
+///
+/// The failure mechanism is not implemented for this function, so it currently
+/// always returns a `WhileOp`: a new one if the transformation took place, or
+/// the input `whileOp` if the loop was already in `do-while` form and
+/// `forceCreateCheck` is `false`.
FailureOr<WhileOp> wrapWhileLoopInZeroTripCheck(WhileOp whileOp,
RewriterBase &rewriter,
bool forceCreateCheck = false);
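
A minimal call-site sketch (not part of this patch), reflecting the note above that the function currently always succeeds; `whileOp` and `rewriter` are assumed to be in scope.

```cpp
FailureOr<scf::WhileOp> rotated =
    wrapWhileLoopInZeroTripCheck(whileOp, rewriter, /*forceCreateCheck=*/false);
if (failed(rotated)) // currently unreachable, kept for API symmetry
  return failure();
scf::WhileOp result = *rotated; // new loop, or the input if already do-while
```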
diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
index b7d6e99..4001ba3 100644
--- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h
@@ -195,6 +195,14 @@ scf::ForallOp fuseIndependentSiblingForallLoops(scf::ForallOp target,
scf::ForOp fuseIndependentSiblingForLoops(scf::ForOp target, scf::ForOp source,
RewriterBase &rewriter);
+/// Normalize an `scf.forall` operation. Returns `failure()` if normalization
+/// fails. On `success()`, returns the newly created operation, with all uses
+/// of the original operation replaced by results of the new operation.
+FailureOr<scf::ForallOp> normalizeForallOp(RewriterBase &rewriter,
+ scf::ForallOp forallOp);
+
} // namespace mlir
#endif // MLIR_DIALECT_SCF_UTILS_UTILS_H_
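
A minimal sketch (not part of this patch) of using the newly declared `normalizeForallOp`; `rewriter` and `forallOp` are assumed to be in scope.

```cpp
FailureOr<scf::ForallOp> normalized = normalizeForallOp(rewriter, forallOp);
if (failed(normalized))
  return failure();
scf::ForallOp newForall = *normalized; // uses of the original op were replaced
```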
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index 68ca036..388efd1 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -61,37 +61,62 @@ struct COOSegment {
/// A simple wrapper to encode a bitset of (at most 64) levels, currently used
/// by `sparse_tensor.iterate` operation for the set of levels on which the
/// coordinates should be loaded.
-class LevelSet {
- uint64_t bits = 0;
+class I64BitSet {
+ uint64_t storage = 0;
public:
- LevelSet() = default;
- explicit LevelSet(uint64_t bits) : bits(bits) {}
- operator uint64_t() const { return bits; }
+ using const_set_bits_iterator = llvm::const_set_bits_iterator_impl<I64BitSet>;
+ const_set_bits_iterator begin() const {
+ return const_set_bits_iterator(*this);
+ }
+ const_set_bits_iterator end() const {
+ return const_set_bits_iterator(*this, -1);
+ }
+ iterator_range<const_set_bits_iterator> bits() const {
+ return make_range(begin(), end());
+ }
+
+ I64BitSet() = default;
+ explicit I64BitSet(uint64_t bits) : storage(bits) {}
+ operator uint64_t() const { return storage; }
- LevelSet &set(unsigned i) {
+ I64BitSet &set(unsigned i) {
assert(i < 64);
- bits |= static_cast<uint64_t>(0x01u) << i;
+ storage |= static_cast<uint64_t>(0x01u) << i;
return *this;
}
- LevelSet &operator|=(LevelSet lhs) {
- bits |= static_cast<uint64_t>(lhs);
+ I64BitSet &operator|=(I64BitSet lhs) {
+ storage |= static_cast<uint64_t>(lhs);
return *this;
}
- LevelSet &lshift(unsigned offset) {
- bits = bits << offset;
+ I64BitSet &lshift(unsigned offset) {
+ storage = storage << offset;
return *this;
}
+ // Needed by `llvm::const_set_bits_iterator_impl`.
+ int find_first() const { return min(); }
+ int find_next(unsigned prev) const {
+ if (prev >= max())
+ return -1;
+
+ uint64_t b = storage >> (prev + 1);
+ if (b == 0)
+ return -1;
+
+ return llvm::countr_zero(b) + prev + 1;
+ }
+
bool operator[](unsigned i) const {
assert(i < 64);
- return (bits & (1 << i)) != 0;
+ return (storage & (1 << i)) != 0;
}
- unsigned max() const { return 64 - llvm::countl_zero(bits); }
- unsigned count() const { return llvm::popcount(bits); }
- bool empty() const { return bits == 0; }
+ unsigned min() const { return llvm::countr_zero(storage); }
+ unsigned max() const { return 64 - llvm::countl_zero(storage); }
+ unsigned count() const { return llvm::popcount(storage); }
+ bool empty() const { return storage == 0; }
};
} // namespace sparse_tensor
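
A small usage sketch (not part of this patch) of the renamed `I64BitSet`, exercising the new set-bit iteration support.

```cpp
// Mark levels 0 and 2 as carrying coordinates, then walk the set bits.
mlir::sparse_tensor::I64BitSet usedLvls;
usedLvls.set(0).set(2);
assert(usedLvls.count() == 2 && usedLvls[2] && !usedLvls[1]);
for (unsigned lvl : usedLvls.bits()) // yields 0, then 2
  llvm::errs() << "coordinates loaded at level " << lvl << "\n";
```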
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
index 69b212c..cb6c1b6 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td
@@ -24,16 +24,17 @@ class SparseTensor_Attr<string name,
// sparse tensor levels.
//===----------------------------------------------------------------------===//
-def LevelSetAttr :
- TypedAttrBase<
- I64, "IntegerAttr",
+def I64BitSetAttr : TypedAttrBase<I64, "IntegerAttr",
And<[CPred<"::llvm::isa<::mlir::IntegerAttr>($_self)">,
CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getType().isInteger(64)">]>,
"LevelSet attribute"> {
- let returnType = [{::mlir::sparse_tensor::LevelSet}];
- let convertFromStorage = [{::mlir::sparse_tensor::LevelSet($_self.getValue().getZExtValue())}];
+ let returnType = [{::mlir::sparse_tensor::I64BitSet}];
+ let convertFromStorage = [{::mlir::sparse_tensor::I64BitSet($_self.getValue().getZExtValue())}];
}
+def I64BitSetArrayAttr :
+ TypedArrayAttrBase<I64BitSetAttr, "I64BitSet array attribute">;
+
//===----------------------------------------------------------------------===//
// These attributes are just like `IndexAttr` except that they clarify whether
// the index refers to a dimension (an axis of the semantic tensor) or a level
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index f31df08..6e17f80 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1306,7 +1306,7 @@ def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResu
def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator,
ParentOneOf<["BinaryOp", "UnaryOp", "ReduceOp", "SelectOp",
- "ForeachOp", "IterateOp"]>]> {
+ "ForeachOp", "IterateOp", "CoIterateOp"]>]> {
let summary = "Yield from sparse_tensor set-like operations";
let description = [{
Yields a value from within a `binary`, `unary`, `reduce`,
@@ -1531,6 +1531,31 @@ def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space",
let hasVerifier = 1;
}
+def ExtractValOp : SparseTensor_Op<"extract_value", [
+ Pure,
+ TypesMatchWith<"result type matches element type of tensor",
+ "tensor", "result",
+ "::llvm::cast<TensorType>($_self).getElementType()">]> {
+ let summary = "Extracts a value from a sparse tensor using an iterator.";
+ let description = [{
+ The `sparse_tensor.extract_value` operation extracts the value
+ pointed to by a sparse iterator from a sparse tensor.
+
+ Example:
+
+ ```mlir
+ %val = sparse_tensor.extract_value %sp at %it
+ : tensor<?x?xf32, #CSR>, !sparse_tensor.iterator<#CSR, lvl = 1>
+ ```
+ }];
+
+ let arguments = (ins AnySparseTensor:$tensor, AnySparseIterator:$iterator);
+ let results = (outs AnyType:$result);
+
+ let assemblyFormat = "$tensor `at` $iterator attr-dict `:` type($tensor)`,` qualified(type($iterator))";
+ let hasVerifier = 1;
+}
+
def IterateOp : SparseTensor_Op<"iterate",
[RecursiveMemoryEffects, RecursivelySpeculatable,
DeclareOpInterfaceMethods<LoopLikeOpInterface,
@@ -1604,14 +1629,14 @@ def IterateOp : SparseTensor_Op<"iterate",
let arguments = (ins AnySparseIterSpace:$iterSpace,
Variadic<AnyType>:$initArgs,
- LevelSetAttr:$crdUsedLvls);
+ I64BitSetAttr:$crdUsedLvls);
let results = (outs Variadic<AnyType>:$results);
let regions = (region SizedRegion<1>:$region);
let skipDefaultBuilders = 1;
let builders = [
OpBuilder<(ins "Value":$iterSpace, "ValueRange":$initArgs)>,
- OpBuilder<(ins "Value":$iterSpace, "ValueRange":$initArgs, "LevelSet" :$crdUsedLvls)>
+ OpBuilder<(ins "Value":$iterSpace, "ValueRange":$initArgs, "I64BitSet" :$crdUsedLvls)>
];
let extraClassDeclaration = [{
@@ -1644,6 +1669,127 @@ def IterateOp : SparseTensor_Op<"iterate",
let hasCustomAssemblyFormat = 1;
}
+def SparseTensor_CoIterateOp : SparseTensor_Op<"coiterate",
+ [AttrSizedOperandSegments,
+ SingleBlockImplicitTerminator<"sparse_tensor::YieldOp">,
+ RecursiveMemoryEffects]> {
+ let summary = "Co-iterates over a set of sparse iteration spaces";
+ let description = [{
+ The `sparse_tensor.coiterate` operation represents a loop (nest) over
+ a set of iteration spaces. The operation can have multiple regions,
+    with each of them defining a case to compute a result at the current iteration.
+ The case condition is defined solely based on the pattern of specified iterators.
+ For example:
+ ```mlir
+ %ret = sparse_tensor.coiterate (%sp1, %sp2) at(%coord) iter_args(%arg = %init)
+ : (!sparse_tensor.iter_space<#CSR, lvls = 0>,
+ !sparse_tensor.iter_space<#COO, lvls = 0>)
+ -> index
+ case %it1, _ {
+      // %coord is specified in space %sp1 but *NOT* specified in space %sp2.
+ }
+ case %it1, %it2 {
+      // %coord is specified in *BOTH* spaces %sp1 and %sp2.
+ }
+ ```
+
+ `sparse_tensor.coiterate` can also operate on loop-carried variables.
+ It returns the final value for each loop-carried variable after loop termination.
+    The initial values of the variables are passed as additional SSA operands
+    after the iteration space operands.
+    Each case region has block arguments for the used coordinates, followed by
+    one argument for each loop-carried variable, representing the value of the
+    variable at the current iteration, followed by arguments for the iterators
+    specified in that case.
+ The body region must contain exactly one block that terminates with
+ `sparse_tensor.yield`.
+
+    The results of a `sparse_tensor.coiterate` hold the final values after
+ the last iteration. If the `sparse_tensor.coiterate` defines any values,
+ a yield must be explicitly present in every region defined in the operation.
+ The number and types of the `sparse_tensor.coiterate` results must match
+ the initial values in the iter_args binding and the yield operands.
+
+
+    The following `sparse_tensor.coiterate` example performs elementwise
+    addition of two sparse vectors.
+
+ ```mlir
+ %ret = sparse_tensor.coiterate (%sp1, %sp2) at(%coord) iter_args(%arg = %init)
+ : (!sparse_tensor.iter_space<#CSR, lvls = 0>,
+ !sparse_tensor.iter_space<#CSR, lvls = 0>)
+ -> tensor<?xindex, #CSR>
+ case %it1, _ {
+ // v = v1 + 0 = v1
+ %v1 = sparse_tensor.extract_value %t1 at %it1 : index
+ %yield = sparse_tensor.insert %v1 into %arg[%coord]
+ sparse_tensor.yield %yield
+ }
+ case _, %it2 {
+ // v = v2 + 0 = v2
+ %v2 = sparse_tensor.extract_value %t2 at %it2 : index
+      %yield = sparse_tensor.insert %v2 into %arg[%coord]
+ sparse_tensor.yield %yield
+ }
+ case %it1, %it2 {
+ // v = v1 + v2
+ %v1 = sparse_tensor.extract_value %t1 at %it1 : index
+ %v2 = sparse_tensor.extract_value %t2 at %it2 : index
+ %v = arith.addi %v1, %v2 : index
+ %yield = sparse_tensor.insert %v into %arg[%coord]
+ sparse_tensor.yield %yield
+ }
+ ```
+ }];
+
+ let arguments = (ins Variadic<AnySparseIterSpace>:$iterSpaces,
+ Variadic<AnyType>:$initArgs,
+ I64BitSetAttr:$crdUsedLvls,
+ I64BitSetArrayAttr:$cases);
+ let results = (outs Variadic<AnyType>:$results);
+ let regions = (region VariadicRegion<SizedRegion<1>>:$caseRegions);
+
+ let extraClassDeclaration = [{
+ unsigned getSpaceDim() {
+ return llvm::cast<::mlir::sparse_tensor::IterSpaceType>(
+ getIterSpaces().front().getType())
+ .getSpaceDim();
+ }
+ I64BitSet getRegionDefinedSpace(unsigned regionIdx) {
+ return I64BitSet(llvm::cast<IntegerAttr>(getCases()[regionIdx])
+ .getValue().getZExtValue());
+ }
+ auto getRegionDefinedSpaces() {
+ return llvm::map_range(getCases().getValue(), [](Attribute attr) {
+ return I64BitSet(llvm::cast<IntegerAttr>(attr).getValue().getZExtValue());
+ });
+ }
+
+    // The block arguments start with the referenced coordinates, followed by
+    // the user-provided iteration arguments, and end with the iterators.
+ Block::BlockArgListType getCrds(unsigned regionIdx) {
+ return getRegion(regionIdx).getArguments()
+ .take_front(getCrdUsedLvls().count());
+ }
+ unsigned getNumRegionIterArgs(unsigned regionIdx) {
+ return getInitArgs().size();
+ }
+ Block::BlockArgListType getRegionIterArgs(unsigned regionIdx) {
+ return getRegion(regionIdx).getArguments()
+ .slice(getCrdUsedLvls().count(), getNumRegionIterArgs(regionIdx));
+ }
+ Block::BlockArgListType getRegionIterators(unsigned regionIdx) {
+ return getRegion(regionIdx).getArguments()
+ .take_back(getRegionDefinedSpace(regionIdx).count());
+ }
+ ValueRange getYieldedValues(unsigned regionIdx);
+ }];
+
+ let hasVerifier = 1;
+ let hasRegionVerifier = 1;
+ let hasCustomAssemblyFormat = 1;
+}
+
//===----------------------------------------------------------------------===//
// Sparse Tensor Debugging and Test-Only Operations.
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 39ad03c..434ff395 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -421,7 +421,7 @@ def Vector_ShuffleOp :
TCresVTEtIsSameAsOpBase<0, 1>>,
InferTypeOpAdaptor]>,
Arguments<(ins AnyFixedVector:$v1, AnyFixedVector:$v2,
- I64ArrayAttr:$mask)>,
+ DenseI64ArrayAttr:$mask)>,
Results<(outs AnyVector:$vector)> {
let summary = "shuffle operation";
let description = [{
@@ -459,11 +459,7 @@ def Vector_ShuffleOp :
: vector<f32>, vector<f32> ; yields vector<2xf32>
```
}];
- let builders = [
- OpBuilder<(ins "Value":$v1, "Value":$v2, "ArrayRef<int64_t>")>
- ];
- let hasFolder = 1;
- let hasCanonicalizer = 1;
+
let extraClassDeclaration = [{
VectorType getV1VectorType() {
return ::llvm::cast<VectorType>(getV1().getType());
@@ -475,7 +471,10 @@ def Vector_ShuffleOp :
return ::llvm::cast<VectorType>(getVector().getType());
}
}];
+
let assemblyFormat = "operands $mask attr-dict `:` type(operands)";
+
+ let hasFolder = 1;
let hasVerifier = 1;
let hasCanonicalizer = 1;
}
@@ -2443,7 +2442,7 @@ def Vector_TypeCastOp :
def Vector_ConstantMaskOp :
Vector_Op<"constant_mask", [Pure]>,
- Arguments<(ins I64ArrayAttr:$mask_dim_sizes)>,
+ Arguments<(ins DenseI64ArrayAttr:$mask_dim_sizes)>,
Results<(outs VectorOfAnyRankOf<[I1]>)> {
let summary = "creates a constant vector mask";
let description = [{
diff --git a/mlir/include/mlir/IR/ODSSupport.h b/mlir/include/mlir/IR/ODSSupport.h
index 70e3f98..25d6f3d 100644
--- a/mlir/include/mlir/IR/ODSSupport.h
+++ b/mlir/include/mlir/IR/ODSSupport.h
@@ -33,6 +33,37 @@ convertFromAttribute(int64_t &storage, Attribute attr,
/// Convert the provided int64_t to an IntegerAttr attribute.
Attribute convertToAttribute(MLIRContext *ctx, int64_t storage);
+/// Convert an IntegerAttr attribute to an int32_t, or return an error if the
+/// attribute isn't an IntegerAttr. If the optional diagnostic is provided, an
+/// error message is also emitted.
+LogicalResult
+convertFromAttribute(int32_t &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the provided int32_t to an IntegerAttr attribute.
+Attribute convertToAttribute(MLIRContext *ctx, int32_t storage);
+
+/// Extract the string from `attr` into `storage`. If `attr` is not a
+/// `StringAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(std::string &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given string into a StringAttr. Note that this takes a reference
+/// to the storage of a string property, which is an std::string.
+Attribute convertToAttribute(MLIRContext *ctx, const std::string &storage);
+
+/// Extract the boolean from `attr` into `storage`. If `attr` is not a
+/// `BoolAttr`, return failure and emit an error into the diagnostic from
+/// `emitError`.
+LogicalResult
+convertFromAttribute(bool &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert the given bool into a BoolAttr.
+Attribute convertToAttribute(MLIRContext *ctx, bool storage);
+
/// Convert a DenseI64ArrayAttr to the provided storage. It is expected that the
/// storage has the same size as the array. An error is returned if the
/// attribute isn't a DenseI64ArrayAttr or it does not have the same size. If
@@ -49,9 +80,24 @@ LogicalResult
convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
function_ref<InFlightDiagnostic()> emitError);
+/// Convert a DenseI64ArrayAttr to the provided storage, which will be
+/// cleared before writing. An error is returned and emitted to the optional
+/// `emitError` function if the attribute isn't a DenseI64ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
+/// Convert a DenseI32ArrayAttr to the provided storage, which will be
+/// cleared before writing. It is expected that the storage has the same size as
+/// the array. An error is returned and emitted to the optional `emitError`
+/// function if the attribute isn't a DenseI32ArrayAttr.
+LogicalResult
+convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError);
+
/// Convert the provided ArrayRef<int64_t> to a DenseI64ArrayAttr attribute.
Attribute convertToAttribute(MLIRContext *ctx, ArrayRef<int64_t> storage);
} // namespace mlir
-#endif // MLIR_IR_ODSSUPPORT_H
\ No newline at end of file
+#endif // MLIR_IR_ODSSUPPORT_H
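
A minimal sketch (not part of this patch) exercising a few of the new conversion overloads declared above. It assumes `using namespace mlir;` and that the caller supplies `ctx` and an `emitErr` callback.

```cpp
static LogicalResult roundTripExamples(
    MLIRContext *ctx, function_ref<InFlightDiagnostic()> emitErr) {
  // int32_t <-> IntegerAttr.
  int32_t count = 0;
  Attribute intAttr = convertToAttribute(ctx, int32_t(42));
  if (failed(convertFromAttribute(count, intAttr, emitErr)))
    return failure();

  // std::string <-> StringAttr.
  std::string name;
  Attribute strAttr = convertToAttribute(ctx, std::string("payload"));
  if (failed(convertFromAttribute(name, strAttr, emitErr)))
    return failure();
  return success();
}
```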
diff --git a/mlir/include/mlir/IR/Properties.td b/mlir/include/mlir/IR/Properties.td
index 0babdbb..0becf7d 100644
--- a/mlir/include/mlir/IR/Properties.td
+++ b/mlir/include/mlir/IR/Properties.td
@@ -29,7 +29,6 @@ class Property<string storageTypeParam = "", string desc = ""> {
//
// Format:
// - `$_storage` will contain the property in the storage type.
- // - `$_ctxt` will contain an `MLIRContext *`.
code convertFromStorage = "$_storage";
// The call expression to build a property storage from the interface type.
@@ -40,24 +39,26 @@ class Property<string storageTypeParam = "", string desc = ""> {
code assignToStorage = "$_storage = $_value";
// The call expression to convert from the storage type to an attribute.
+ // The resulting attribute must be non-null in non-error cases.
//
// Format:
// - `$_storage` is the storage type value.
// - `$_ctxt` is a `MLIRContext *`.
//
- // The expression must result in an Attribute.
+ // The expression must return an `Attribute` and will be used as a function body.
code convertToAttribute = [{
- convertToAttribute($_ctxt, $_storage)
+ return convertToAttribute($_ctxt, $_storage);
}];
// The call expression to convert from an Attribute to the storage type.
//
// Format:
- // - `$_storage` is the storage type value.
+ // - `$_storage` is a reference to a value of the storage type.
// - `$_attr` is the attribute.
// - `$_diag` is a callback to get a Diagnostic to emit error.
//
- // The expression must return a LogicalResult
+ // The expression must return a LogicalResult and will be used as a function body
+ // or in other similar contexts.
code convertFromAttribute = [{
return convertFromAttribute($_storage, $_attr, $_diag);
}];
@@ -68,18 +69,68 @@ class Property<string storageTypeParam = "", string desc = ""> {
// - `$_storage` is the variable to hash.
//
// The expression should define a llvm::hash_code.
- code hashProperty = [{
- llvm::hash_value($_storage);
+ // If unspecified, defaults to `llvm::hash_value($_storage)`.
+ // The default is not specified in tablegen because many combinators, like
+ // ArrayProperty, can fall back to more efficient implementations of
+ // `hashProperty` when their underlying elements have trivial hashing.
+ code hashProperty = "";
+
+ // The body of the parser for a value of this property.
+ // Format:
+ // - `$_parser` is the OpAsmParser.
+ // - `$_storage` is the location into which the value is to be placed if it is
+ // present.
+ // - `$_ctxt` is a `MLIRContext *`
+ //
+ // This defines the body of a function (typically a lambda) that returns a
+ // ParseResult. There is an implicit `return success()` at the end of the parser
+ // code.
+ //
+ // When this code executes, `$_storage` will be initialized to the property's
+ // default value (if any, accounting for the storage type override).
+ code parser = [{
+ auto value = ::mlir::FieldParser<}] # storageType # [{>::parse($_parser);
+ if (::mlir::failed(value))
+ return ::mlir::failure();
+ $_storage = std::move(*value);
}];
+ // The body of the parser for a value of this property as the anchor of an optional
+ // group. This should parse the property if possible and do nothing if a value of
+ // the relevant type is not next in the parse stream.
+ // You are not required to define this parser if it cannot be meaningfully
+ // implemented.
+ // This has the same context and substitutions as `parser` except that it is
+ // required to return an OptionalParseResult.
+ //
+ // If the optional parser doesn't parse anything, it should not set
+ // $_storage, since the parser doesn't know if the default value has been
+ // overwritten.
+ code optionalParser = "";
+
+ // The printer for a value of this property.
+ // Format:
+ // - `$_storage` is the storage data.
+ // - `$_printer` is the OpAsmPrinter instance.
+ // - `$_ctxt` is a `MLIRContext *`
+ //
+ // This may be called in an expression context, so variable declarations must
+ // be placed within a new scope.
+ //
+ // The printer for a property should always print a non-empty value - default value
+ // printing elision happens outside the context of this printing expression.
+ code printer = "$_printer << $_storage";
+
// The call expression to emit the storage type to bytecode.
//
// Format:
// - `$_storage` is the storage type value.
// - `$_writer` is a `DialectBytecodeWriter`.
// - `$_ctxt` is a `MLIRContext *`.
+ //
+  // This will become the body of a function returning void.
code writeToMlirBytecode = [{
- writeToMlirBytecode($_writer, $_storage)
+ writeToMlirBytecode($_writer, $_storage);
}];
// The call expression to read the storage type from bytecode.
@@ -88,13 +139,31 @@ class Property<string storageTypeParam = "", string desc = ""> {
// - `$_storage` is the storage type value.
// - `$_reader` is a `DialectBytecodeReader`.
// - `$_ctxt` is a `MLIRContext *`.
+ //
+ // This will become the body of a function returning LogicalResult.
+ // There is an implicit `return success()` at the end of this function.
+ //
+ // When this code executes, `$_storage` will be initialized to the property's
+ // default value (if any, accounting for the storage type override).
code readFromMlirBytecode = [{
if (::mlir::failed(readFromMlirBytecode($_reader, $_storage)))
return ::mlir::failure();
}];
- // Default value for the property.
- string defaultValue = ?;
+ // Base definition for the property. (Will be) used for `OptionalProperty` and
+ // such cases, analogously to `baseAttr`.
+ Property baseProperty = ?;
+
+ // Default value for the property within its storage. This should be an expression
+  // of type `interfaceType` and should be comparable with other values of that
+  // interface type with `==`. The empty string means there is no default value.
+ string defaultValue = "";
+
+  // If set, the default value the storage of the property should be initialized to.
+ // This is only needed when the storage and interface types of the property
+ // are distinct (ex. SmallVector for storage vs. ArrayRef for interfacing), as it
+ // will fall back to `defaultValue` when unspecified.
+ string storageTypeValueOverride = "";
}
/// Implementation of the Property class's `readFromMlirBytecode` field using
@@ -133,12 +202,16 @@ defvar writeMlirBytecodeWithConvertToAttribute = [{
// Primitive property kinds
// Any kind of integer stored as properties.
-class IntProperty<string storageTypeParam = "", string desc = ""> :
+class IntProperty<string storageTypeParam, string desc = ""> :
Property<storageTypeParam, desc> {
- code writeToMlirBytecode = [{
+ let summary = !if(!empty(desc), storageTypeParam, desc);
+ let optionalParser = [{
+ return $_parser.parseOptionalInteger($_storage);
+ }];
+ let writeToMlirBytecode = [{
$_writer.writeVarInt($_storage);
}];
- code readFromMlirBytecode = [{
+ let readFromMlirBytecode = [{
uint64_t val;
if (failed($_reader.readVarInt(val)))
return ::mlir::failure();
@@ -146,24 +219,472 @@ class IntProperty<string storageTypeParam = "", string desc = ""> :
}];
}
-class ArrayProperty<string storageTypeParam = "", int n, string desc = ""> :
- Property<storageTypeParam # "[" # n # "]", desc> {
- let interfaceType = "::llvm::ArrayRef<" # storageTypeParam # ">";
- let convertFromStorage = "$_storage";
- let assignToStorage = "::llvm::copy($_value, $_storage)";
-}
+def I32Property : IntProperty<"int32_t">;
+def I64Property : IntProperty<"int64_t">;
-class EnumProperty<string storageTypeParam, string desc = ""> :
+class EnumProperty<string storageTypeParam, string desc = "", string default = ""> :
Property<storageTypeParam, desc> {
- code writeToMlirBytecode = [{
+ // TODO: take advantage of EnumAttrInfo and the like to make this share nice
+ // parsing code with EnumAttr.
+ let writeToMlirBytecode = [{
$_writer.writeVarInt(static_cast<uint64_t>($_storage));
}];
- code readFromMlirBytecode = [{
+ let readFromMlirBytecode = [{
uint64_t val;
if (failed($_reader.readVarInt(val)))
return ::mlir::failure();
$_storage = static_cast<}] # storageTypeParam # [{>(val);
}];
+ let defaultValue = default;
}
+def StringProperty : Property<"std::string", "string"> {
+ let interfaceType = "::llvm::StringRef";
+ let convertFromStorage = "::llvm::StringRef{$_storage}";
+ let assignToStorage = "$_storage = $_value.str()";
+ let optionalParser = [{
+ if (::mlir::failed($_parser.parseOptionalString(&$_storage)))
+ return std::nullopt;
+ }];
+ let printer = "$_printer.printString($_storage)";
+ let readFromMlirBytecode = [{
+ StringRef val;
+ if (::mlir::failed($_reader.readString(val)))
+ return ::mlir::failure();
+ $_storage = val.str();
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedString($_storage);
+ }];
+}
+
+def BoolProperty : IntProperty<"bool", "boolean"> {
+ let printer = [{ $_printer << ($_storage ? "true" : "false") }];
+ let readFromMlirBytecode = [{
+ return $_reader.readBool($_storage);
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage);
+ }];
+}
+
+def UnitProperty : Property<"bool", "unit property"> {
+ let summary = "unit property";
+ let description = [{
+    A property whose presence or absence is used as a flag.
+
+ This is stored as a boolean that defaults to false, and is named UnitProperty
+ by analogy with UnitAttr, which has the more comprehensive rationale and
+ explains the less typical syntax.
+
+    Note that this property does have a syntax for the false case to allow for its
+ use in contexts where default values shouldn't be elided.
+ }];
+ let defaultValue = "false";
+
+ let convertToAttribute = [{
+ if ($_storage)
+ return ::mlir::UnitAttr::get($_ctxt);
+ else
+ return ::mlir::BoolAttr::get($_ctxt, false);
+ }];
+ let convertFromAttribute = [{
+ if (::llvm::isa<::mlir::UnitAttr>($_attr)) {
+ $_storage = true;
+ return ::mlir::success();
+ }
+ if (auto boolAttr = ::llvm::dyn_cast<::mlir::BoolAttr>($_attr)) {
+ $_storage = boolAttr.getValue();
+ return ::mlir::success();
+ }
+ return ::mlir::failure();
+ }];
+
+ let parser = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+ {"unit", "unit_absent"})))
+ return $_parser.emitError($_parser.getCurrentLocation(),
+ "expected 'unit' or 'unit_absent'");
+ $_storage = (keyword == "unit");
+ }];
+
+ let optionalParser = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword,
+ {"unit", "unit_absent"})))
+ return std::nullopt;
+ $_storage = (keyword == "unit");
+ }];
+
+ let printer = [{
+ $_printer << ($_storage ? "unit" : "unit_absent")
+ }];
+
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage);
+ }];
+ let readFromMlirBytecode = [{
+ if (::mlir::failed($_reader.readBool($_storage)))
+ return ::mlir::failure();
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// Primitive property combinators
+
+/// Create a variable named `name` of `prop`'s storage type that is initialized
+/// to the correct default value, if there is one.
+class _makePropStorage<Property prop, string name> {
+ code ret = prop.storageType # " " # name
+ # !cond(!not(!empty(prop.storageTypeValueOverride)) : " = " # prop.storageTypeValueOverride,
+ !not(!empty(prop.defaultValue)) : " = " # prop.defaultValue,
+ true : "") # ";";
+}
+
+/// The generic class for arrays of some other property, which is stored as a
+/// `SmallVector` of that property. This uses an `ArrayAttr` as its attribute form
+/// though subclasses can override this, as is the case with IntArrayAttr below.
+/// Those wishing to use a non-default number of SmallVector elements should
+/// subclass `ArrayProperty`.
+class ArrayProperty<Property elem = Property<>, string desc = ""> :
+ Property<"::llvm::SmallVector<" # elem.storageType # ">", desc> {
+ let summary = "array of " # elem.summary;
+ let interfaceType = "::llvm::ArrayRef<" # elem.storageType # ">";
+ let convertFromStorage = "::llvm::ArrayRef<" # elem.storageType # ">{$_storage}";
+ let assignToStorage = "$_storage.assign($_value.begin(), $_value.end())";
+
+ let convertFromAttribute = [{
+ auto arrayAttr = ::llvm::dyn_cast_if_present<::mlir::ArrayAttr>($_attr);
+ if (!arrayAttr)
+ return $_diag() << "expected array attribute";
+ for (::mlir::Attribute elemAttr : arrayAttr) {
+ }] # _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemRes = [&](Attribute propAttr, }] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_attr", "propAttr",
+ !subst("$_storage", "propStorage", elem.convertFromAttribute)) # [{
+ }(elemAttr, elemVal);
+ if (::mlir::failed(elemRes))
+ return ::mlir::failure();
+ $_storage.push_back(std::move(elemVal));
+ }
+ return ::mlir::success();
+ }];
+
+ let convertToAttribute = [{
+ SmallVector<Attribute> elems;
+ for (const auto& elemVal : $_storage) {
+ auto elemAttr = [&](const }] # elem.storageType #[{& propStorage) -> ::mlir::Attribute {
+ }] # !subst("$_storage", "propStorage", elem.convertToAttribute) # [{
+ }(elemVal);
+ elems.push_back(elemAttr);
+ }
+ return ::mlir::ArrayAttr::get($_ctxt, elems);
+ }];
+
+ defvar theParserBegin = [{
+ auto& storage = $_storage;
+ auto parseElemFn = [&]() -> ::mlir::ParseResult {
+ }] # _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemParse = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::ParseResult {
+ }] # !subst("$_storage", "propStorage", elem.parser) # [{
+ return ::mlir::success();
+ }(elemVal);
+ if (::mlir::failed(elemParse))
+ return ::mlir::failure();
+ storage.push_back(std::move(elemVal));
+ return ::mlir::success();
+ };
+ }];
+ let parser = theParserBegin # [{
+ return $_parser.parseCommaSeparatedList(
+ ::mlir::OpAsmParser::Delimiter::Square, parseElemFn);
+ }];
+ // Hack around the lack of a peek method
+ let optionalParser = theParserBegin # [{
+ auto oldLoc = $_parser.getCurrentLocation();
+ auto parseResult = $_parser.parseCommaSeparatedList(
+ ::mlir::OpAsmParser::Delimiter::OptionalSquare, parseElemFn);
+ if (::mlir::failed(parseResult))
+ return ::mlir::failure();
+ auto newLoc = $_parser.getCurrentLocation();
+ if (oldLoc == newLoc)
+ return std::nullopt;
+ return ::mlir::success();
+ }];
+
+ let printer = [{ [&](){
+ $_printer << "[";
+ auto elemPrinter = [&](const }] # elem.storageType # [{& elemVal) {
+ }] # !subst("$_storage", "elemVal", elem.printer) #[{;
+ };
+ ::llvm::interleaveComma($_storage, $_printer, elemPrinter);
+ $_printer << "]";
+ }()}];
+
+ let readFromMlirBytecode = [{
+ uint64_t length;
+ if (::mlir::failed($_reader.readVarInt(length)))
+ return ::mlir::failure();
+ $_storage.reserve(length);
+ for (uint64_t i = 0; i < length; ++i) {
+ }]# _makePropStorage<elem, "elemVal">.ret # [{
+ auto elemRead = [&](}] # elem.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage", elem.readFromMlirBytecode) # [{;
+ return ::mlir::success();
+ }(elemVal);
+ if (::mlir::failed(elemRead))
+ return ::mlir::failure();
+ $_storage.push_back(std::move(elemVal));
+ }
+ }];
+
+ let writeToMlirBytecode = [{
+ $_writer.writeVarInt($_storage.size());
+ for (const auto& elemVal : $_storage) {
+ [&]() {
+ }] # !subst("$_storage", "elemVal", elem.writeToMlirBytecode) #[{;
+ }();
+ }
+ }];
+
+ // There's no hash_value for SmallVector<T>, so we construct the ArrayRef ourselves.
+ // In the non-trivial case, we define a mapped range to get internal hash
+ // codes.
+ let hashProperty = !if(!empty(elem.hashProperty),
+ [{::llvm::hash_value(::llvm::ArrayRef<}] # elem.storageType # [{>{$_storage})}],
+ [{[&]() -> ::llvm::hash_code {
+ auto getElemHash = [](const auto& propStorage) -> ::llvm::hash_code {
+ return }] # !subst("$_storage", "propStorage", elem.hashProperty) # [{;
+ };
+ auto mapped = ::llvm::map_range($_storage, getElemHash);
+ return ::llvm::hash_combine_range(mapped.begin(), mapped.end());
+ }()
+ }]);
+}
+
+class IntArrayProperty<string storageTypeParam = "", string desc = ""> :
+ ArrayProperty<IntProperty<storageTypeParam, desc>> {
+ // Bring back the trivial conversions we don't get in the general case.
+ let convertFromAttribute = [{
+ return convertFromAttribute($_storage, $_attr, $_diag);
+ }];
+ let convertToAttribute = [{
+ return convertToAttribute($_ctxt, $_storage);
+ }];
+}
+
+/// Class for giving a property a default value.
+/// This doesn't change anything about the property other than giving it a default
+/// which can be used by ODS to elide printing.
+class DefaultValuedProperty<Property p, string default = "", string storageDefault = ""> : Property<p.storageType, p.summary> {
+ let defaultValue = default;
+ let storageTypeValueOverride = storageDefault;
+ let baseProperty = p;
+ // Keep up to date with `Property` above.
+ let summary = p.summary;
+ let description = p.description;
+ let storageType = p.storageType;
+ let interfaceType = p.interfaceType;
+ let convertFromStorage = p.convertFromStorage;
+ let assignToStorage = p.assignToStorage;
+ let convertToAttribute = p.convertToAttribute;
+ let convertFromAttribute = p.convertFromAttribute;
+ let hashProperty = p.hashProperty;
+ let parser = p.parser;
+ let optionalParser = p.optionalParser;
+ let printer = p.printer;
+ let readFromMlirBytecode = p.readFromMlirBytecode;
+ let writeToMlirBytecode = p.writeToMlirBytecode;
+}
+
+/// An optional property, stored as an std::optional<p.storageType>
+/// interfaced with as an std::optional<p.interfaceType>.
+/// The syntax is `none` (or empty string if elided) for an absent value or
+/// `some<[underlying property]>` when a value is set.
+///
+/// As a special exception, if the underlying property has an optional parser and
+/// no default value (ex. an integer property), the printer will skip the `some`
+/// bracketing and delegate to the optional parser. In that case, the syntax is the
+/// syntax of the underlying property, or the keyword `none` in the rare cases that
+/// it is needed. This behavior can be disabled by setting `canDelegateParsing` to 0.
+class OptionalProperty<Property p, bit canDelegateParsing = 1>
+ : Property<"std::optional<" # p.storageType # ">", "optional " # p.summary> {
+
+ // In the cases where the underlying attribute is plain old data that's passed by
+ // value, the conversion code is trivial.
+ defvar hasTrivialStorage = !and(!eq(p.convertFromStorage, "$_storage"),
+ !eq(p.assignToStorage, "$_storage = $_value"),
+ !eq(p.storageType, p.interfaceType));
+
+ defvar delegatesParsing = !and(!empty(p.defaultValue),
+ !not(!empty(p.optionalParser)), canDelegateParsing);
+
+ let interfaceType = "std::optional<" # p.interfaceType # ">";
+ let defaultValue = "std::nullopt";
+
+ let convertFromStorage = !if(hasTrivialStorage,
+ p.convertFromStorage,
+ [{($_storage.has_value() ? std::optional<}] # p.interfaceType # ">{"
+ # !subst("$_storage", "(*($_storage))", p.convertFromStorage)
+ # [{} : std::nullopt)}]);
+ let assignToStorage = !if(hasTrivialStorage,
+ p.assignToStorage,
+ [{[&]() {
+ if (!$_value.has_value()) {
+ $_storage = std::nullopt;
+ return;
+ }
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ [&](}] # p.storageType # [{& propStorage) {
+ }] # !subst("$_storage", "propStorage",
+ !subst("$_value", "(*($_value))", p.assignToStorage)) # [{;
+ }(presentVal);
+ $_storage = std::move(presentVal);
+ }()}]);
+
+ let convertFromAttribute = [{
+ auto arrayAttr = ::llvm::dyn_cast<::mlir::ArrayAttr>($_attr);
+ if (!arrayAttr)
+ return $_diag() << "expected optional properties to materialize as arrays";
+ if (arrayAttr.size() > 1)
+ return $_diag() << "expected optional properties to become 0- or 1-element arrays";
+ if (arrayAttr.empty()) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ ::mlir::Attribute presentAttr = arrayAttr[0];
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentRes = [&](Attribute propAttr, }] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage",
+ !subst("$_attr", "propAttr", p.convertFromAttribute)) # [{
+ }(presentAttr, presentVal);
+ if (::mlir::failed(presentRes))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ return ::mlir::success();
+ }];
+
+ let convertToAttribute = [{
+ if (!$_storage.has_value()) {
+ return ::mlir::ArrayAttr::get($_ctxt, {});
+ }
+ auto attr = [&]() -> ::mlir::Attribute {
+ }] # !subst("$_storage", "(*($_storage))", p.convertToAttribute) # [{
+ }();
+ return ::mlir::ArrayAttr::get($_ctxt, {attr});
+ }];
+
+ defvar delegatedParserBegin = [{
+ if (::mlir::succeeded($_parser.parseOptionalKeyword("none"))) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ }] #_makePropStorage<p, "presentVal">.ret # [{
+ auto delegParseResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::OptionalParseResult {
+ }] # !subst("$_storage", "propStorage", p.optionalParser) # [{
+ return ::mlir::success();
+ }(presentVal);
+ if (!delegParseResult.has_value()) {
+ }];
+
+ defvar delegatedParserEnd = [{
+ }
+ if (delegParseResult.has_value() && ::mlir::failed(*delegParseResult))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ return ::mlir::success();
+ }];
+ // If we're being explicitly called for our parser, we're expecting to have been
+  // printed into a context where the default value isn't elided. Therefore,
+ // not-present from the underlying parser is a failure.
+ defvar delegatedParser = delegatedParserBegin # [{
+ return ::mlir::failure();
+ }] # delegatedParserEnd;
+ defvar delegatedOptionalParser = delegatedParserBegin # [{
+ return std::nullopt;
+ }] # delegatedParserEnd;
+
+ defvar generalParserBegin = [{
+ ::llvm::StringRef keyword;
+ if (::mlir::failed($_parser.parseOptionalKeyword(&keyword, {"none", "some"}))) {
+ }];
+ defvar generalParserEnd = [{
+ }
+ if (keyword == "none") {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ if (::mlir::failed($_parser.parseLess()))
+ return ::mlir::failure();
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentParse = [&](}] # p.storageType # [{& propStorage) -> ::mlir::ParseResult {
+ }] # !subst("$_storage", "propStorage", p.parser) # [{
+ return ::mlir::success();
+ }(presentVal);
+ if (presentParse || $_parser.parseGreater())
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ }];
+ defvar generalParser = generalParserBegin # [{
+ return $_parser.emitError($_parser.getCurrentLocation(), "expected 'none' or 'some<prop>'");
+ }] # generalParserEnd;
+ defvar generalOptionalParser = generalParserBegin # [{
+ return std::nullopt;
+ }] # generalParserEnd;
+
+ let parser = !if(delegatesParsing, delegatedParser, generalParser);
+ let optionalParser = !if(delegatesParsing,
+ delegatedOptionalParser, generalOptionalParser);
+
+ defvar delegatedPrinter = [{
+ [&]() {
+ if (!$_storage.has_value()) {
+ $_printer << "none";
+ return;
+ }
+ }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+ }()}];
+ defvar generalPrinter = [{
+ [&]() {
+ if (!$_storage.has_value()) {
+ $_printer << "none";
+ return;
+ }
+ $_printer << "some<";
+ }] # !subst("$_storage", "(*($_storage))", p.printer) # [{;
+ $_printer << ">";
+ }()}];
+ let printer = !if(delegatesParsing, delegatedPrinter, generalPrinter);
+
+ let readFromMlirBytecode = [{
+ bool isPresent = false;
+ if (::mlir::failed($_reader.readBool(isPresent)))
+ return ::mlir::failure();
+ if (!isPresent) {
+ $_storage = std::nullopt;
+ return ::mlir::success();
+ }
+ }] # _makePropStorage<p, "presentVal">.ret # [{
+ auto presentResult = [&](}] # p.storageType # [{& propStorage) -> ::mlir::LogicalResult {
+ }] # !subst("$_storage", "propStorage", p.readFromMlirBytecode) # [{;
+ return ::mlir::success();
+ }(presentVal);
+ if (::mlir::failed(presentResult))
+ return ::mlir::failure();
+ $_storage = std::move(presentVal);
+ }];
+ let writeToMlirBytecode = [{
+ $_writer.writeOwnedBool($_storage.has_value());
+ if (!$_storage.has_value())
+ return;
+ }] # !subst("$_storage", "(*($_storage))", p.writeToMlirBytecode);
+
+ let hashProperty = !if(!empty(p.hashProperty), p.hashProperty,
+ [{ ::llvm::hash_value($_storage.has_value() ? std::optional<::llvm::hash_code>{}] #
+ !subst("$_storage", "(*($_storage))", p.hashProperty) #[{} : std::nullopt) }]);
+ assert !or(!not(delegatesParsing), !eq(defaultValue, "std::nullopt")),
+ "For delegated parsing to be used, the default value must be nullopt. " #
+ "To use a non-trivial default, set the canDelegateParsing argument to 0";
+}
#endif // PROPERTIES
diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h
index cc5853c..768291a 100644
--- a/mlir/include/mlir/TableGen/Operator.h
+++ b/mlir/include/mlir/TableGen/Operator.h
@@ -384,7 +384,7 @@ private:
SmallVector<NamedAttribute, 4> attributes;
/// The properties of the op.
- SmallVector<NamedProperty> properties;
+ SmallVector<NamedProperty, 4> properties;
/// The arguments of the op (operands and native attributes).
SmallVector<Argument, 4> arguments;
diff --git a/mlir/include/mlir/TableGen/Property.h b/mlir/include/mlir/TableGen/Property.h
index d0d6f49..702e675 100644
--- a/mlir/include/mlir/TableGen/Property.h
+++ b/mlir/include/mlir/TableGen/Property.h
@@ -35,12 +35,20 @@ class Property {
public:
explicit Property(const llvm::Record *record);
explicit Property(const llvm::DefInit *init);
- Property(StringRef storageType, StringRef interfaceType,
- StringRef convertFromStorageCall, StringRef assignToStorageCall,
- StringRef convertToAttributeCall, StringRef convertFromAttributeCall,
+ Property(StringRef summary, StringRef description, StringRef storageType,
+ StringRef interfaceType, StringRef convertFromStorageCall,
+ StringRef assignToStorageCall, StringRef convertToAttributeCall,
+ StringRef convertFromAttributeCall, StringRef parserCall,
+ StringRef optionalParserCall, StringRef printerCall,
StringRef readFromMlirBytecodeCall,
StringRef writeToMlirBytecodeCall, StringRef hashPropertyCall,
- StringRef defaultValue);
+ StringRef defaultValue, StringRef storageTypeValueOverride);
+
+ // Returns the summary (for error messages) of this property's type.
+ StringRef getSummary() const { return summary; }
+
+ // Returns the description of this property.
+ StringRef getDescription() const { return description; }
// Returns the storage type.
StringRef getStorageType() const { return storageType; }
@@ -66,6 +74,19 @@ public:
return convertFromAttributeCall;
}
+ // Returns the method call which parses this property from textual MLIR.
+ StringRef getParserCall() const { return parserCall; }
+
+ // Returns true if this property has defined an optional parser.
+ bool hasOptionalParser() const { return !optionalParserCall.empty(); }
+
+ // Returns the method call which optionally parses this property from textual
+ // MLIR.
+ StringRef getOptionalParserCall() const { return optionalParserCall; }
+
+ // Returns the method call which prints this property to textual MLIR.
+ StringRef getPrinterCall() const { return printerCall; }
+
// Returns the method call which reads this property from
// bytecode and assign it to the storage.
StringRef getReadFromMlirBytecodeCall() const {
@@ -87,6 +108,24 @@ public:
// Returns the default value for this Property.
StringRef getDefaultValue() const { return defaultValue; }
+ // Returns whether this Property has a default storage-type value that is
+ // distinct from its default interface-type value.
+ bool hasStorageTypeValueOverride() const {
+ return !storageTypeValueOverride.empty();
+ }
+
+ StringRef getStorageTypeValueOverride() const {
+ return storageTypeValueOverride;
+ }
+
+ // Returns this property's TableGen def-name.
+ StringRef getPropertyDefName() const;
+
+ // Returns the base-level property that this Property constraint is based on
+  // or the Property itself otherwise. (Note: there are currently no
+  // property constraints; this function is added for future-proofing.)
+ Property getBaseProperty() const;
+
// Returns the TableGen definition this Property was constructed from.
const llvm::Record &getDef() const { return *def; }
@@ -95,16 +134,22 @@ private:
const llvm::Record *def;
// Elements describing a Property, in general fetched from the record.
+ StringRef summary;
+ StringRef description;
StringRef storageType;
StringRef interfaceType;
StringRef convertFromStorageCall;
StringRef assignToStorageCall;
StringRef convertToAttributeCall;
StringRef convertFromAttributeCall;
+ StringRef parserCall;
+ StringRef optionalParserCall;
+ StringRef printerCall;
StringRef readFromMlirBytecodeCall;
StringRef writeToMlirBytecodeCall;
StringRef hashPropertyCall;
StringRef defaultValue;
+ StringRef storageTypeValueOverride;
};
// A struct wrapping an op property and its name together
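
A brief sketch (not part of this patch) of how a TableGen backend might consult the extended `Property` wrapper; the function name and output stream are hypothetical, while the accessors are those declared above.

```cpp
// Emit the optional parser if one is defined, otherwise the general parser,
// plus the storage-level default when it differs from the interface default.
void emitPropertyParser(const mlir::tblgen::Property &prop,
                        llvm::raw_ostream &os) {
  os << "// " << prop.getSummary() << "\n";
  if (prop.hasOptionalParser())
    os << prop.getOptionalParserCall() << "\n";
  else
    os << prop.getParserCall() << "\n";
  if (prop.hasStorageTypeValueOverride())
    os << "// storage default: " << prop.getStorageTypeValueOverride() << "\n";
}
```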
diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
index 19171d6..f2a0ab3 100644
--- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp
+++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp
@@ -107,6 +107,7 @@ MlirSparseTensorLevelType mlirSparseTensorEncodingAttrBuildLvlType(
unsigned size, unsigned n, unsigned m) {
std::vector<LevelPropNonDefault> props;
+ props.reserve(size);
for (unsigned i = 0; i < size; i++)
props.push_back(static_cast<LevelPropNonDefault>(properties[i]));
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 033e66c..b808738 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -321,6 +321,22 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
return success();
}
};
+
+struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
+ SchedBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<SchedBarrierOp>(converter), chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(SchedBarrierOp op, SchedBarrierOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ rewriter.replaceOpWithNewOp<ROCDL::SchedBarrier>(op,
+ (uint32_t)op.getOpts());
+ return success();
+ }
+};
+
} // namespace
/// If `input` is a vector of bytes, concatentate those bytes in little-endian
@@ -879,8 +895,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
- LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
- ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+ LDSBarrierOpLowering, SchedBarrierOpLowering, MFMAOpLowering,
+ WMMAOpLowering, ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering>(converter, chipset);
}
diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
index db93f18..50384d9 100644
--- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
+++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
@@ -421,6 +421,38 @@ public:
}
};
+template <class ArithOp, class EmitCOp>
+class BinaryUIOpConversion final : public OpConversionPattern<ArithOp> {
+public:
+ using OpConversionPattern<ArithOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(ArithOp uiBinOp, typename ArithOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Type newRetTy = this->getTypeConverter()->convertType(uiBinOp.getType());
+ if (!newRetTy)
+ return rewriter.notifyMatchFailure(uiBinOp,
+ "converting result type failed");
+ if (!isa<IntegerType>(newRetTy)) {
+ return rewriter.notifyMatchFailure(uiBinOp, "expected integer type");
+ }
+ Type unsignedType =
+ adaptIntegralTypeSignedness(newRetTy, /*needsUnsigned=*/true);
+ if (!unsignedType)
+ return rewriter.notifyMatchFailure(uiBinOp,
+ "converting result type failed");
+ Value lhsAdapted = adaptValueType(uiBinOp.getLhs(), rewriter, unsignedType);
+ Value rhsAdapted = adaptValueType(uiBinOp.getRhs(), rewriter, unsignedType);
+
+ auto newDivOp =
+ rewriter.create<EmitCOp>(uiBinOp.getLoc(), unsignedType,
+ ArrayRef<Value>{lhsAdapted, rhsAdapted});
+ Value resultAdapted = adaptValueType(newDivOp, rewriter, newRetTy);
+ rewriter.replaceOp(uiBinOp, resultAdapted);
+ return success();
+ }
+};
+
template <typename ArithOp, typename EmitCOp>
class IntegerOpConversion final : public OpConversionPattern<ArithOp> {
public:
@@ -722,6 +754,8 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter,
ArithOpConversion<arith::MulFOp, emitc::MulOp>,
ArithOpConversion<arith::RemSIOp, emitc::RemOp>,
ArithOpConversion<arith::SubFOp, emitc::SubOp>,
+ BinaryUIOpConversion<arith::DivUIOp, emitc::DivOp>,
+ BinaryUIOpConversion<arith::RemUIOp, emitc::RemOp>,
IntegerOpConversion<arith::AddIOp, emitc::AddOp>,
IntegerOpConversion<arith::MulIOp, emitc::MulOp>,
IntegerOpConversion<arith::SubIOp, emitc::SubOp>,
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 11d2975..cf984ca 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -11,6 +11,7 @@
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -1666,6 +1667,40 @@ struct NVGPUTmaPrefetchOpLowering
}
};
+struct NVGPURcpOpLowering : public ConvertOpToLLVMPattern<nvgpu::RcpOp> {
+ using ConvertOpToLLVMPattern<nvgpu::RcpOp>::ConvertOpToLLVMPattern;
+ LogicalResult
+ matchAndRewrite(nvgpu::RcpOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+ auto i64Ty = b.getI64Type();
+ auto f32Ty = b.getF32Type();
+ VectorType inTy = op.getIn().getType();
+ // Apply rcp.approx.ftz.f32 on each element of the vector.
+ auto convert1DVec = [&](Type llvm1DVectorTy, Value inVec) {
+ Value ret1DVec = b.create<LLVM::UndefOp>(llvm1DVectorTy);
+ int numElems = llvm::cast<VectorType>(llvm1DVectorTy).getNumElements();
+ for (int i = 0; i < numElems; i++) {
+ Value idx = b.create<LLVM::ConstantOp>(i64Ty, b.getI64IntegerAttr(i));
+ Value elem = b.create<LLVM::ExtractElementOp>(inVec, idx);
+ Value dst = b.create<NVVM::RcpApproxFtzF32Op>(f32Ty, elem);
+ ret1DVec = b.create<LLVM::InsertElementOp>(ret1DVec, dst, idx);
+ }
+ return ret1DVec;
+ };
+ if (inTy.getRank() == 1) {
+ rewriter.replaceOp(op, convert1DVec(inTy, adaptor.getIn()));
+ return success();
+ }
+ return LLVM::detail::handleMultidimensionalVectors(
+ op.getOperation(), adaptor.getOperands(), *(this->getTypeConverter()),
+ [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
+ OpAdaptor adaptor(operands);
+ return convert1DVec(llvm1DVectorTy, adaptor.getIn());
+ },
+ rewriter);
+ }
+};
} // namespace
void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -1688,5 +1723,5 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
NVGPUWarpgroupMmaInitAccumulatorOpLowering, // nvgpu.warpgroup.mma.init.accumulator
MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
- NVGPUMmaSparseSyncLowering>(converter);
+ NVGPUMmaSparseSyncLowering, NVGPURcpOpLowering>(converter);
}
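As a reference sketch of what the per-element lowering above computes (assumed semantics; the helper below is illustrative and not part of the patch): every f32 lane is replaced by an approximate reciprocal, with denormal inputs flushed to zero, which is the rcp.approx.ftz behaviour the op requires.

#include <cmath>
#include <vector>

// Scalar reference for one lane; the hardware instruction is approximate,
// the exact division here only models the nominal value.
static float rcpApproxFtz(float x) {
  if (std::fpclassify(x) == FP_SUBNORMAL)
    x = std::copysign(0.0f, x);             // flush-to-zero on the input
  return 1.0f / x;
}

std::vector<float> rcpVector(const std::vector<float> &in) {
  std::vector<float> out;
  out.reserve(in.size());
  for (float x : in)
    out.push_back(rcpApproxFtz(x));         // one NVVM::RcpApproxFtzF32Op per lane
  return out;
}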
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index fdc4c7b..6d25023 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -366,7 +366,7 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
// Declare reductions.
// TODO: consider checking it here is already a compatible reduction
// declaration and use it instead of redeclaring.
- SmallVector<Attribute> reductionDeclSymbols;
+ SmallVector<Attribute> reductionSyms;
SmallVector<omp::DeclareReductionOp> ompReductionDecls;
auto reduce = cast<scf::ReduceOp>(parallelOp.getBody()->getTerminator());
for (int64_t i = 0, e = parallelOp.getNumReductions(); i < e; ++i) {
@@ -374,7 +374,7 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
ompReductionDecls.push_back(decl);
if (!decl)
return failure();
- reductionDeclSymbols.push_back(
+ reductionSyms.push_back(
SymbolRefAttr::get(rewriter.getContext(), decl.getSymName()));
}
@@ -444,16 +444,16 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
// Create the parallel wrapper.
auto ompParallel = rewriter.create<omp::ParallelOp>(
loc,
- /* if_expr = */ Value{},
- /* num_threads_var = */ numThreadsVar,
/* allocate_vars = */ llvm::SmallVector<Value>{},
- /* allocators_vars = */ llvm::SmallVector<Value>{},
- /* reduction_vars = */ llvm::SmallVector<Value>{},
- /* reduction_vars_isbyref = */ DenseBoolArrayAttr{},
- /* reductions = */ ArrayAttr{},
- /* proc_bind_val = */ omp::ClauseProcBindKindAttr{},
+ /* allocator_vars = */ llvm::SmallVector<Value>{},
+ /* if_expr = */ Value{},
+ /* num_threads = */ numThreadsVar,
/* private_vars = */ ValueRange(),
- /* privatizers = */ nullptr);
+ /* private_syms = */ nullptr,
+ /* proc_bind_kind = */ omp::ClauseProcBindKindAttr{},
+ /* reduction_vars = */ llvm::SmallVector<Value>{},
+ /* reduction_byref = */ DenseBoolArrayAttr{},
+ /* reduction_syms = */ ArrayAttr{});
{
OpBuilder::InsertionGuard guard(rewriter);
@@ -465,15 +465,15 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
// Create worksharing loop wrapper.
auto wsloopOp = rewriter.create<omp::WsloopOp>(parallelOp.getLoc());
if (!reductionVariables.empty()) {
- wsloopOp.setReductionsAttr(
- ArrayAttr::get(rewriter.getContext(), reductionDeclSymbols));
+ wsloopOp.setReductionSymsAttr(
+ ArrayAttr::get(rewriter.getContext(), reductionSyms));
wsloopOp.getReductionVarsMutable().append(reductionVariables);
- llvm::SmallVector<bool> byRefVec;
+ llvm::SmallVector<bool> reductionByRef;
// false because these reductions always reduce scalars and so do
// not need to pass by reference
- byRefVec.resize(reductionVariables.size(), false);
- wsloopOp.setReductionVarsByref(
- DenseBoolArrayAttr::get(rewriter.getContext(), byRefVec));
+ reductionByRef.resize(reductionVariables.size(), false);
+ wsloopOp.setReductionByref(
+ DenseBoolArrayAttr::get(rewriter.getContext(), reductionByRef));
}
rewriter.create<omp::TerminatorOp>(loc); // omp.parallel terminator.
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index f6b1c42..53e18a2 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -994,7 +994,7 @@ public:
auto v2Type = shuffleOp.getV2VectorType();
auto vectorType = shuffleOp.getResultVectorType();
Type llvmType = typeConverter->convertType(vectorType);
- auto maskArrayAttr = shuffleOp.getMask();
+ ArrayRef<int64_t> mask = shuffleOp.getMask();
// Bail if result type cannot be lowered.
if (!llvmType)
@@ -1015,7 +1015,7 @@ public:
if (rank <= 1 && v1Type == v2Type) {
Value llvmShuffleOp = rewriter.create<LLVM::ShuffleVectorOp>(
loc, adaptor.getV1(), adaptor.getV2(),
- LLVM::convertArrayToIndices<int32_t>(maskArrayAttr));
+ llvm::to_vector_of<int32_t>(mask));
rewriter.replaceOp(shuffleOp, llvmShuffleOp);
return success();
}
@@ -1029,8 +1029,7 @@ public:
eltType = cast<VectorType>(llvmType).getElementType();
Value insert = rewriter.create<LLVM::UndefOp>(loc, llvmType);
int64_t insPos = 0;
- for (const auto &en : llvm::enumerate(maskArrayAttr)) {
- int64_t extPos = cast<IntegerAttr>(en.value()).getInt();
+ for (int64_t extPos : mask) {
Value value = adaptor.getV1();
if (extPos >= v1Dim) {
extPos -= v1Dim;
diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
index 527fbe5..21b8858 100644
--- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
+++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp
@@ -527,10 +527,7 @@ struct VectorShuffleOpConvert final
return rewriter.notifyMatchFailure(shuffleOp,
"unsupported result vector type");
- SmallVector<int32_t, 4> mask = llvm::map_to_vector<4>(
- shuffleOp.getMask(), [](Attribute attr) -> int32_t {
- return cast<IntegerAttr>(attr).getValue().getZExtValue();
- });
+ auto mask = llvm::to_vector_of<int32_t>(shuffleOp.getMask());
VectorType oldV1Type = shuffleOp.getV1VectorType();
VectorType oldV2Type = shuffleOp.getV2VectorType();
@@ -906,6 +903,43 @@ struct VectorReductionToFPDotProd final
}
};
+struct VectorStepOpConvert final : OpConversionPattern<vector::StepOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::StepOp stepOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ const auto &typeConverter = *getTypeConverter<SPIRVTypeConverter>();
+ Type dstType = typeConverter.convertType(stepOp.getType());
+ if (!dstType)
+ return failure();
+
+ Location loc = stepOp.getLoc();
+ int64_t numElements = stepOp.getType().getNumElements();
+ auto intType =
+ rewriter.getIntegerType(typeConverter.getIndexTypeBitwidth());
+
+ // Input vectors of size 1 are converted to scalars by the type converter.
+ // We just create a constant in this case.
+ if (numElements == 1) {
+ Value zero = spirv::ConstantOp::getZero(intType, loc, rewriter);
+ rewriter.replaceOp(stepOp, zero);
+ return success();
+ }
+
+ SmallVector<Value> source;
+ source.reserve(numElements);
+ for (int64_t i = 0; i < numElements; ++i) {
+ Attribute intAttr = rewriter.getIntegerAttr(intType, i);
+ Value constOp = rewriter.create<spirv::ConstantOp>(loc, intType, intAttr);
+ source.push_back(constOp);
+ }
+ rewriter.replaceOpWithNewOp<spirv::CompositeConstructOp>(stepOp, dstType,
+ source);
+ return success();
+ }
+};
+
} // namespace
#define CL_INT_MAX_MIN_OPS \
spirv::CLUMaxOp, spirv::CLUMinOp, spirv::CLSMaxOp, spirv::CLSMinOp
@@ -929,8 +963,9 @@ void mlir::populateVectorToSPIRVPatterns(SPIRVTypeConverter &typeConverter,
VectorReductionFloatMinMax<GL_FLOAT_MAX_MIN_OPS>, VectorShapeCast,
VectorInsertStridedSliceOpConvert, VectorShuffleOpConvert,
VectorInterleaveOpConvert, VectorDeinterleaveOpConvert,
- VectorSplatPattern, VectorLoadOpConverter, VectorStoreOpConverter>(
- typeConverter, patterns.getContext(), PatternBenefit(1));
+ VectorSplatPattern, VectorLoadOpConverter, VectorStoreOpConverter,
+ VectorStepOpConvert>(typeConverter, patterns.getContext(),
+ PatternBenefit(1));
// Make sure that the more specialized dot product pattern has higher benefit
// than the generic one that extracts all elements.
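A small sketch of the value the new VectorStepOpConvert pattern materializes (assuming a 32-bit index type in the SPIR-V type converter; the standalone helper is illustrative only): vector.step of size N becomes the lane indices 0..N-1, built from one constant per lane and combined with a composite construct.

#include <cstdint>
#include <vector>

std::vector<int32_t> stepVector(int64_t numElements) {
  std::vector<int32_t> lanes;
  lanes.reserve(numElements);
  for (int64_t i = 0; i < numElements; ++i)
    lanes.push_back(static_cast<int32_t>(i)); // one spirv.Constant per lane
  return lanes;                               // merged via spirv.CompositeConstruct
}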
diff --git a/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt b/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt
index 61e49b0..3a19963 100644
--- a/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Analysis/CMakeLists.txt
@@ -8,6 +8,9 @@ add_mlir_dialect_library(MLIRAffineAnalysis
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
+ DEPENDS
+ MLIRFuncOpsIncGen
+
LINK_LIBS PUBLIC
MLIRAffineDialect
MLIRAnalysis
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index aa5eb95..641b7d7 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -1739,6 +1739,8 @@ OpFoldResult arith::BitcastOp::fold(FoldAdaptor adaptor) {
APInt bits = llvm::isa<FloatAttr>(operand)
? llvm::cast<FloatAttr>(operand).getValue().bitcastToAPInt()
: llvm::cast<IntegerAttr>(operand).getValue();
+ assert(resType.getIntOrFloatBitWidth() == bits.getBitWidth() &&
+ "trying to fold on broken IR: operands have incompatible types");
if (auto resFloatType = llvm::dyn_cast<FloatType>(resType))
return FloatAttr::get(resType,
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
index 39292c4..1e71167 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
@@ -479,7 +479,7 @@ struct SwapVectorExtractOfArithExtend
return rewriter.notifyMatchFailure(extractOp,
"extracted type is not a vector type");
- auto numScalableDims = llvm::count(resultType.getScalableDims(), true);
+ auto numScalableDims = resultType.getNumScalableDims();
if (numScalableDims != 1)
return rewriter.notifyMatchFailure(
extractOp, "extracted type is not a 1-D scalable vector type");
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index c76d489..53df7af 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -548,7 +548,7 @@ struct FoldExtractFromVectorOfSMELikeCreateMasks
return rewriter.notifyMatchFailure(extractOp,
"extracted type is not a vector type");
- auto numScalable = llvm::count(extractedMaskType.getScalableDims(), true);
+ auto numScalable = extractedMaskType.getNumScalableDims();
if (numScalable != 2)
return rewriter.notifyMatchFailure(
extractOp, "expected extracted type to be an SME-like mask");
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index d1db90b..99b625d 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -879,15 +879,66 @@ struct FoldFillWithTranspose : OpRewritePattern<linalg::TransposeOp> {
}
};
+/// Fold a concat whose operands are all fills of the same value
+/// into a fill of the concat result shape.
+struct FoldConcatsOfFill : public OpRewritePattern<tensor::ConcatOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(tensor::ConcatOp concatOp,
+ PatternRewriter &rewriter) const override {
+ auto concatOperands = concatOp.getInputs();
+ if (concatOperands.empty()) {
+ return failure();
+ }
+
+ auto firstFillOp = concatOperands.front().getDefiningOp<linalg::FillOp>();
+ if (!firstFillOp) {
+ return failure();
+ }
+ // Prefetch the fill value.
+ OpFoldResult firstFillVal =
+ getAsOpFoldResult(firstFillOp.getDpsInputOperand(0)->get());
+ // Collect all the outs values for the fill operations.
+ SmallVector<Value> allOuts;
+ allOuts.push_back(firstFillOp.getDpsInitOperand(0)->get());
+
+ auto isDefinedByCompatibleFillOp = [&](Value v) -> bool {
+ auto fillOp = v.getDefiningOp<linalg::FillOp>();
+ if (!fillOp) {
+ return false;
+ }
+
+ OpFoldResult fillVal =
+ getAsOpFoldResult(fillOp.getDpsInputOperand(0)->get());
+ if (fillVal != firstFillVal)
+ return false;
+
+ allOuts.push_back(fillOp.getDpsInitOperand(0)->get());
+ return true;
+ };
+ if (!llvm::all_of(concatOperands.drop_front(),
+ isDefinedByCompatibleFillOp)) {
+ return rewriter.notifyMatchFailure(
+ concatOp, "not all operands are defined by a compatible fill op");
+ }
+
+ Value outsConcat = rewriter.create<tensor::ConcatOp>(
+ concatOp.getLoc(), concatOp.getDim(), allOuts);
+ rewriter.replaceOpWithNewOp<linalg::FillOp>(
+ concatOp, firstFillOp.getDpsInputOperand(0)->get(), outsConcat);
+ return success();
+ }
+};
+
} // namespace
void FillOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results
- .add<FoldFillWithCopy, FoldFillWithTensorExtract, FoldFillWithPack,
- FoldFillWithPad, FoldFillWithTensorReshape<tensor::CollapseShapeOp>,
- FoldFillWithTensorReshape<tensor::ExpandShapeOp>,
- FoldInsertPadIntoFill, FoldFillWithTranspose>(context);
+ results.add<FoldConcatsOfFill, FoldFillWithCopy, FoldFillWithTensorExtract,
+ FoldFillWithPack, FoldFillWithPad,
+ FoldFillWithTensorReshape<tensor::CollapseShapeOp>,
+ FoldFillWithTensorReshape<tensor::ExpandShapeOp>,
+ FoldInsertPadIntoFill, FoldFillWithTranspose>(context);
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index b611347..9baf358 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -3151,12 +3152,100 @@ void transform::TileUsingForallOp::build(OpBuilder &builder,
/*mapping=*/mapping);
}
+/// Given `lbs`, `ubs` and `steps` of loops, return, for each loop, the
+/// normalized upper bound.
+static SmallVector<OpFoldResult>
+normalizeUpperBounds(RewriterBase &rewriter, Location loc,
+ ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs,
+ ArrayRef<OpFoldResult> steps) {
+ AffineExpr s0, s1, s2;
+ bindSymbols(rewriter.getContext(), s0, s1, s2);
+ AffineExpr normalizedUbExpr = (s1 - s0).ceilDiv(s2);
+ SmallVector<OpFoldResult> normalizedUbs;
+ for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) {
+ OpFoldResult normalizedUb = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, normalizedUbExpr, {lb, ub, step});
+ normalizedUbs.push_back(normalizedUb);
+ }
+ return normalizedUbs;
+}
+
+/// When a loop is normalized, the uses of the induction variable within the
+/// loop need to be replaced with `original_lb + old_iv * original_step`.
+static SmallVector<Value> denormalizeIndVar(RewriterBase &rewriter,
+ Location loc, ValueRange ivs,
+ ArrayRef<OpFoldResult> lbs,
+ ArrayRef<OpFoldResult> steps) {
+ AffineExpr s0, s1;
+ AffineExpr d0;
+ bindSymbols(rewriter.getContext(), s0, s1);
+ bindDims(rewriter.getContext(), d0);
+ AffineExpr denormExpr = s0 + d0 * s1;
+ SmallVector<Value> denormalizedIvs;
+
+ for (auto [iv, lb, step] : llvm::zip_equal(ivs, lbs, steps)) {
+ OpFoldResult denormValue = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, denormExpr, ArrayRef<OpFoldResult>{iv, lb, step});
+ denormalizedIvs.push_back(
+ getValueOrCreateConstantIndexOp(rewriter, loc, denormValue));
+ }
+ return denormalizedIvs;
+}
+
+/// Given a `scf.forall` loop, return a loop op with the loop bounds
+/// normalized.
+/// TODO: Replace this with a general utility to normalize `scf.forall`.
+/// At the time of writing, this wasn't done since adding it to the `scf`
+/// dialect would disallow the use of `affine.apply` operations due to
+/// cyclic dependencies. To avoid churn in lit tests with the change this
+/// was added with, that is deferred to a follow-up.
+static scf::ForallOp normalizeForallLoopOp(RewriterBase &rewriter,
+ scf::ForallOp loop) {
+ SmallVector<OpFoldResult> lbs = loop.getMixedLowerBound();
+ SmallVector<OpFoldResult> ubs = loop.getMixedUpperBound();
+ SmallVector<OpFoldResult> steps = loop.getMixedStep();
+
+ if (llvm::all_of(
+ lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) &&
+ llvm::all_of(
+ steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) {
+ return loop;
+ }
+
+ Location loc = loop.getLoc();
+ SmallVector<OpFoldResult> normalizedUbs =
+ normalizeUpperBounds(rewriter, loc, lbs, ubs, steps);
+ SmallVector<OpFoldResult> normalizedLbs(normalizedUbs.size(),
+ rewriter.getIndexAttr(0));
+ SmallVector<OpFoldResult> normalizedSteps(normalizedUbs.size(),
+ rewriter.getIndexAttr(1));
+
+ auto normalizedForallOp = rewriter.create<scf::ForallOp>(
+ loc, normalizedLbs, normalizedUbs, normalizedSteps, loop.getOutputs(),
+ loop.getMapping(), [](OpBuilder &, Location, ValueRange) {});
+
+ auto normalizedLoopIvs = normalizedForallOp.getInductionVars();
+ OpBuilder::InsertionGuard g(rewriter);
+ Block *normalizedLoopBlock = normalizedForallOp.getBody();
+ rewriter.setInsertionPointToStart(normalizedLoopBlock);
+
+ SmallVector<Value> argValues =
+ denormalizeIndVar(rewriter, loc, normalizedLoopIvs, lbs, steps);
+ argValues.append(normalizedForallOp.getRegionIterArgs().begin(),
+ normalizedForallOp.getRegionIterArgs().end());
+ Block *origLoopBlock = loop.getBody();
+ rewriter.mergeBlocks(origLoopBlock, normalizedLoopBlock, argValues);
+
+ rewriter.replaceOp(loop, normalizedForallOp);
+ return normalizedForallOp;
+}
+
DiagnosedSilenceableFailure transform::tileToForallOpImpl(
RewriterBase &rewriter, transform::TransformState &state,
TransformOpInterface transformOp, Operation *target,
ArrayRef<OpFoldResult> mixedNumThreads,
ArrayRef<OpFoldResult> mixedTileSizes, std::optional<ArrayAttr> mapping,
- linalg::ForallTilingResult &tilingResult) {
+ scf::SCFTilingResult &tilingResult) {
// Transform all targets one by one.
auto tileableOp = dyn_cast<TilingInterface>(target);
if (!tileableOp) {
@@ -3167,20 +3256,35 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl(
return diag;
}
rewriter.setInsertionPoint(tileableOp);
- FailureOr<linalg::ForallTilingResult> maybeTilingResult = failure();
+ scf::SCFTilingOptions options;
+ options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
if (!mixedNumThreads.empty()) {
- maybeTilingResult =
- linalg::tileToForallOp(rewriter, tileableOp, mixedNumThreads, mapping);
+ options.setNumThreads(mixedNumThreads);
} else {
- maybeTilingResult = linalg::tileToForallOpUsingTileSizes(
- rewriter, tileableOp, mixedTileSizes, mapping);
+ options.setTileSizes(mixedTileSizes);
}
+ if (mapping) {
+ options.setMapping(mapping.value().getValue());
+ }
+ FailureOr<scf::SCFTilingResult> maybeTilingResult =
+ scf::tileUsingSCF(rewriter, tileableOp, options);
if (failed(maybeTilingResult))
return transformOp.emitDefaultSilenceableFailure(tileableOp);
- rewriter.replaceOp(tileableOp, maybeTilingResult->tileOp->getResults());
+
+ rewriter.replaceOp(tileableOp, maybeTilingResult->replacements);
tilingResult = *maybeTilingResult;
+
+ if (mixedNumThreads.empty()) {
+ auto generatedForallOp = cast<scf::ForallOp>(tilingResult.loops.front());
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPoint(generatedForallOp);
+ scf::ForallOp normalizedForallOp =
+ normalizeForallLoopOp(rewriter, generatedForallOp);
+ tilingResult.loops.front() = normalizedForallOp;
+ }
+
return DiagnosedSilenceableFailure::success();
}
@@ -3214,14 +3318,14 @@ DiagnosedSilenceableFailure transform::TileUsingForallOp::apply(
return status;
for (Operation *target : state.getPayloadOps(getTarget())) {
- linalg::ForallTilingResult tilingResult;
+ scf::SCFTilingResult tilingResult;
DiagnosedSilenceableFailure diag = tileToForallOpImpl(
rewriter, state, transformOp, target, mixedNumThreads, mixedTileSizes,
getMapping(), tilingResult);
if (!diag.succeeded())
return diag;
- tileOps.push_back(tilingResult.tileOp);
- tiledOps.push_back(tilingResult.tiledOp);
+ tileOps.push_back(tilingResult.loops.front());
+ tiledOps.append(tilingResult.tiledOps);
}
transformResults.set(cast<OpResult>(getForallOp()), tileOps);
@@ -3699,7 +3803,7 @@ DiagnosedSilenceableFailure transform::MapCopyToThreadsOp::applyToOne(
// OpBuilder only used to compute attributes.
OpBuilder b(getContext());
- linalg::ForallTilingResult tilingResult;
+ scf::SCFTilingResult tilingResult;
DiagnosedSilenceableFailure diag = tileToForallOpImpl(
/*rewriter=*/rewriter,
/*state=*/state,
@@ -3712,8 +3816,9 @@ DiagnosedSilenceableFailure transform::MapCopyToThreadsOp::applyToOne(
if (!diag.succeeded())
return diag;
- results.push_back(tilingResult.tileOp);
- results.push_back(tilingResult.tiledOp);
+ results.push_back(tilingResult.loops.front());
+ for (auto op : tilingResult.tiledOps)
+ results.push_back(op);
return DiagnosedSilenceableFailure::success();
}
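As a sketch of the arithmetic behind the new normalization helpers (a hedged reading of normalizeUpperBounds and denormalizeIndVar, with lb, ub and step the original bounds and iv' the induction variable of the normalized loop):

\[
  ub' = \left\lceil \frac{ub - lb}{step} \right\rceil, \qquad lb' = 0, \qquad step' = 1,
  \qquad iv = lb + iv' \cdot step .
\]

The first formula matches the affine expression (s1 - s0).ceilDiv(s2); the last matches s0 + d0 * s1, which rewrites uses of the induction variable inside the merged loop body.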
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 8ef8651..fb6ab20 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -435,188 +435,6 @@ static void calculateTileOffsetsAndSizes(
}
}
-/// Returns a vector of bools representing if, for each axis, `op` can be tiled
-/// without incurring in a race condition and thus it is thread-safe to do the
-/// tiling. This is checked by iterating over numThreads and ensuring that the
-/// corresponding iterator type is "parallel". If it is not, then we know that
-/// such dimension is unsafe to tile.
-SmallVector<bool> safeToTileToForall(mlir::MLIRContext *ctx, LinalgOp linalgOp,
- ArrayRef<OpFoldResult> numThreads) {
- auto iterators = linalgOp.getIteratorTypesArray();
- SmallVector<bool> safeToTile(numThreads.size(), true);
-
- for (unsigned i = 0, e = numThreads.size(); i != e; i++) {
- if (auto attr = llvm::dyn_cast_if_present<Attribute>(numThreads[i])) {
- if (cast<IntegerAttr>(attr).getValue().getSExtValue() > 1) {
- safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
- }
- } else {
- safeToTile[i] = iterators[i] == utils::IteratorType::parallel;
- }
- }
- return safeToTile;
-}
-
-/// Rewrite a TilingInterface `op` to a tiled `scf.forall`. The
-/// tiling is specified by the number of tiles/threads `numThreads` and the
-/// optional nominal tile size `nominalTileSizes`. If `nominalTilSizes` is
-/// not specified, then it is derived from `numThreads` as `ceilDiv(dimSize[i],
-/// numThreads[i])`. If non-empty, the `mapping` is added as an
-/// attribute to the resulting `scf.forall`. A zero tile sizes indicate
-/// that the dimension is not tiled, and can be thought of as tiling by the full
-/// size of data.
-/// It is the user's responsibility to ensure that `numThreads` is a valid
-/// tiling specification (i.e. that only tiles parallel dimensions, e.g. in the
-/// Linalg case). If the dimension is not parallelizable, a warning is issued to
-/// notify the user that the generated code is not safe to parallelize. If
-/// `omitTileOffsetBoundsCheck` is true, then the function will assume that
-/// `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds.
-static FailureOr<ForallTilingResult> tileToForallOpImpl(
- RewriterBase &b, TilingInterface op, ArrayRef<OpFoldResult> numThreads,
- std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
- std::optional<ArrayAttr> mapping, bool omitTileOffsetBoundsCheck) {
- Location loc = op->getLoc();
- OpBuilder::InsertionGuard g(b);
-
- SmallVector<Range> loopRanges = op.getIterationDomain(b);
- if (loopRanges.empty())
- return op->emitOpError("expected non-empty loop ranges");
- auto hasStrideOne = [](Range r) { return !isConstantIntValue(r.stride, 1); };
- if (llvm::any_of(loopRanges, hasStrideOne))
- return op->emitOpError("only stride-1 supported atm");
-
- // Gather destination tensors.
- SmallVector<Value> dest;
- if (failed(tensor::getOrCreateDestinations(b, loc, op, dest)))
- return op->emitOpError("failed to get destination tensors");
-
- SmallVector<OpFoldResult> nonZeroNumThreads =
- llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) {
- return !isConstantIntValue(ofr, 0);
- }));
- SmallVector<Value> materializedNonZeroNumThreads =
- llvm::to_vector(llvm::map_range(nonZeroNumThreads, [&](OpFoldResult ofr) {
- return getValueOrCreateConstantIndexOp(b, loc, ofr);
- }));
-
- LinalgOp linalgOp = dyn_cast<LinalgOp>(op.getOperation());
- if (linalgOp) {
- // Check if tiling is thread safe and print a warning if not.
- SmallVector<bool> tilingSafety =
- safeToTileToForall(b.getContext(), linalgOp, numThreads);
- for (size_t i = 0; i < tilingSafety.size(); i++)
- if (!tilingSafety[i])
- op.emitWarning() << "tiling is not thread safe at axis #" << i;
- }
-
- // 1. Create the ForallOp. We don't use the lambda body-builder
- // version because we require the use of RewriterBase in the body, so we
- // manually move the insertion point to the body below.
- scf::ForallOp forallOp = b.create<scf::ForallOp>(
- loc, getAsOpFoldResult((materializedNonZeroNumThreads)), dest, mapping);
-
- // 2. Fill out the ForallOp body.
- SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
- calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, loopRanges,
- omitTileOffsetBoundsCheck, nominalTileSizes,
- tiledOffsets, tiledSizes);
-
- // 3. Clone the tileable op and update its destination operands to use the
- // output bbArgs of the ForallOp.
- ArrayRef<BlockArgument> destBbArgs = forallOp.getRegionIterArgs();
- Operation *tiledOp = nullptr;
- SmallVector<Value> tiledValues;
- {
- // 3.a. RAII guard, inserting within forallOp, before terminator.
- OpBuilder::InsertionGuard g(b);
- b.setInsertionPoint(forallOp.getTerminator());
- Operation *clonedOp = b.clone(*op.getOperation());
- auto destinationStyleOp = dyn_cast<DestinationStyleOpInterface>(clonedOp);
- if (destinationStyleOp) {
- for (OpOperand &outOperand : destinationStyleOp.getDpsInitsMutable()) {
- // Swap tensor inits with the corresponding block argument of the
- // scf.forall op. Memref inits remain as is.
- if (isa<TensorType>(outOperand.get().getType())) {
- auto *it = llvm::find(dest, outOperand.get());
- assert(it != dest.end() && "could not find destination tensor");
- unsigned destNum = std::distance(dest.begin(), it);
- outOperand.set(destBbArgs[destNum]);
- }
- }
- }
-
- // 4. Tile the cloned op and delete the clone.
- FailureOr<TilingResult> tilingResult =
- cast<TilingInterface>(clonedOp).getTiledImplementation(b, tiledOffsets,
- tiledSizes);
- if (failed(tilingResult))
- return clonedOp->emitError("Failed to tile op: ");
- if (tilingResult->tiledOps.size() != 1) {
- return clonedOp->emitError("expected a single produced tiled op, got ")
- << tilingResult->tiledOps.size();
- }
-
- b.eraseOp(clonedOp);
- tiledOp = tilingResult->tiledOps.front();
- tiledValues = tilingResult->tiledValues;
- }
-
- // 5. Parallel insert back into the result tensor.
- for (auto it : llvm::zip(llvm::seq(unsigned(0), unsigned(dest.size())),
- tiledValues, destBbArgs)) {
- // 5.a. Partial subset information is inserted just before the terminator.
- OpBuilder::InsertionGuard g(b);
- b.setInsertionPoint(forallOp.getTerminator());
-
- SmallVector<OpFoldResult> resultOffsets, resultSizes;
- if (failed(op.getResultTilePosition(b, std::get<0>(it), tiledOffsets,
- tiledSizes, resultOffsets,
- resultSizes)))
- return op->emitOpError("output offsets couldn't be calculated");
- SmallVector<OpFoldResult> strides(resultSizes.size(), b.getIndexAttr(1));
-
- // 5.b. Parallel insertions are inserted at the end of the combining
- // terminator.
- b.setInsertionPointToEnd(forallOp.getTerminator().getBody());
- b.create<tensor::ParallelInsertSliceOp>(loc, std::get<1>(it),
- std::get<2>(it), resultOffsets,
- resultSizes, strides);
- }
- return ForallTilingResult{forallOp, tiledOp};
-}
-
-FailureOr<ForallTilingResult>
-linalg::tileToForallOp(RewriterBase &b, TilingInterface op,
- ArrayRef<OpFoldResult> numThreads,
- std::optional<ArrayAttr> mapping) {
- return tileToForallOpImpl(b, op, numThreads,
- /*nominalTileSizes=*/std::nullopt, mapping,
- /*omitTileOffsetBoundsCheck=*/false);
-}
-
-FailureOr<ForallTilingResult>
-linalg::tileToForallOpUsingTileSizes(RewriterBase &b, TilingInterface op,
- ArrayRef<OpFoldResult> tileSizes,
- std::optional<ArrayAttr> mapping) {
- SmallVector<Range> loopRanges = op.getIterationDomain(b);
- unsigned nLoops = loopRanges.size();
- SmallVector<OpFoldResult> numThreads;
- numThreads.reserve(nLoops);
- AffineExpr s0, s1;
- bindSymbols(b.getContext(), s0, s1);
- AffineExpr divExpr = s0.ceilDiv(s1);
- for (const auto &it : llvm::zip(tileSizes, loopRanges)) {
- OpFoldResult numTiles = std::get<0>(it);
- if (!isConstantIntValue(numTiles, 0))
- numTiles = makeComposedFoldedAffineApply(
- b, op.getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)});
- numThreads.push_back(numTiles);
- }
- return tileToForallOpImpl(b, op, numThreads,
- /*nominalTileSizes=*/tileSizes, mapping,
- /*omitTileOffsetBoundsCheck=*/true);
-}
-
template <typename LoopTy>
static FailureOr<TiledLinalgOp>
tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef<OpFoldResult> tileSizes,
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 9185663..3d0d6ab 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -522,9 +522,11 @@ mlir::linalg::getCombinerOpKind(Operation *combinerOp) {
.Case<arith::MaxSIOp>([&](auto op) { return CombiningKind::MAXSI; })
.Case<arith::MaxUIOp>([&](auto op) { return CombiningKind::MAXUI; })
.Case<arith::MaximumFOp>([&](auto op) { return CombiningKind::MAXIMUMF; })
+ .Case<arith::MaxNumFOp>([&](auto op) { return CombiningKind::MAXNUMF; })
.Case<arith::MinSIOp>([&](auto op) { return CombiningKind::MINSI; })
.Case<arith::MinUIOp>([&](auto op) { return CombiningKind::MINUI; })
.Case<arith::MinimumFOp>([&](auto op) { return CombiningKind::MINIMUMF; })
+ .Case<arith::MinNumFOp>([&](auto op) { return CombiningKind::MINNUMF; })
.Case<arith::MulIOp, arith::MulFOp>(
[&](auto op) { return CombiningKind::MUL; })
.Case<arith::OrIOp>([&](auto op) { return CombiningKind::OR; })
diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
index f4fae68..d6fe221 100644
--- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
@@ -861,7 +861,34 @@ AsinPolynomialApproximation::matchAndRewrite(math::AsinOp op,
return builder.create<arith::MulFOp>(a, b);
};
- Value s = mul(operand, operand);
+ auto sub = [&](Value a, Value b) -> Value {
+ return builder.create<arith::SubFOp>(a, b);
+ };
+
+ auto abs = [&](Value a) -> Value { return builder.create<math::AbsFOp>(a); };
+
+ auto sqrt = [&](Value a) -> Value { return builder.create<math::SqrtOp>(a); };
+
+ auto scopy = [&](Value a, Value b) -> Value {
+ return builder.create<math::CopySignOp>(a, b);
+ };
+
+ auto sel = [&](Value a, Value b, Value c) -> Value {
+ return builder.create<arith::SelectOp>(a, b, c);
+ };
+
+ Value abso = abs(operand);
+ Value aa = mul(operand, operand);
+ Value opp = sqrt(sub(bcast(floatCst(builder, 1.0, elementType)), aa));
+
+ Value gt =
+ builder.create<arith::CmpFOp>(arith::CmpFPredicate::OGT, aa,
+ bcast(floatCst(builder, 0.5, elementType)));
+
+ Value x = sel(gt, opp, abso);
+
+ // Asin(x) approximation for x in [-9/16, 9/16]:
+ Value s = mul(x, x);
Value q = mul(s, s);
Value r = bcast(floatCst(builder, 5.5579749017470502e-2, elementType));
Value t = bcast(floatCst(builder, -6.2027913464120114e-2, elementType));
@@ -878,8 +905,12 @@ AsinPolynomialApproximation::matchAndRewrite(math::AsinOp op,
t = fma(t, q, bcast(floatCst(builder, 7.4999999991367292e-2, elementType)));
r = fma(r, s, t);
r = fma(r, s, bcast(floatCst(builder, 1.6666666666670193e-1, elementType)));
- t = mul(operand, s);
- r = fma(r, t, operand);
+ t = mul(x, s);
+ r = fma(r, t, x);
+
+ Value rsub = sub(bcast(floatCst(builder, 1.57079632679, elementType)), r);
+ r = sel(gt, rsub, r);
+ r = scopy(r, operand);
rewriter.replaceOp(op, r);
return success();
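As a sketch of the range reduction the extended asin approximation relies on (a hedged reading of the code above, with a = |x|): when a^2 > 1/2 the polynomial is evaluated on the complementary argument and the result is folded back, and the final copysign restores the sign of the operand.

\[
  \operatorname{asin}(x) = \operatorname{sign}(x)\,\operatorname{asin}(|x|), \qquad
  \operatorname{asin}(a) = \frac{\pi}{2} - \operatorname{asin}\!\left(\sqrt{1 - a^{2}}\right)
  \quad \text{for } a^{2} > \tfrac12 .
\]

This keeps the argument fed to the polynomial within roughly [0, 1/sqrt(2)], where the coefficients above are accurate.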
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 26f831f..de9bbcb 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -645,6 +645,21 @@ LogicalResult WarpgroupMmaInitAccumulatorOp::verify() {
}
//===----------------------------------------------------------------------===//
+// RcpOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult RcpOp::verify() {
+ RcpRoundingModeAttr rounding = getRoundingAttr();
+ bool ftz = getFtz();
+ // Currently, only `rcp_approx` and `ftz` are supported.
+ if (rounding.getValue() != RcpRoundingMode::APPROX || !ftz) {
+ return emitOpError() << "has a limitation. " << rounding
+ << " or non-ftz is not supported yet.";
+ }
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
// TableGen'd dialect, type, and op definitions
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
index 387a690..ed7425b 100644
--- a/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/IR/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_dialect_library(MLIROpenACCDialect
MLIROpenACCEnumsIncGen
MLIROpenACCAttributesIncGen
MLIROpenACCMPOpsInterfacesIncGen
+ MLIROpenACCOpsInterfacesIncGen
MLIROpenACCTypeInterfacesIncGen
LINK_LIBS PUBLIC
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index c3c6dff..b4da504 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -24,6 +24,7 @@ using namespace acc;
#include "mlir/Dialect/OpenACC/OpenACCOpsDialect.cpp.inc"
#include "mlir/Dialect/OpenACC/OpenACCOpsEnums.cpp.inc"
+#include "mlir/Dialect/OpenACC/OpenACCOpsInterfaces.cpp.inc"
#include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.cpp.inc"
#include "mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index 2a29bd1..41ba7f8 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIROpenACCTransforms
MLIROpenACCEnumsIncGen
MLIROpenACCAttributesIncGen
MLIROpenACCMPOpsInterfacesIncGen
+ MLIROpenACCOpsInterfacesIncGen
MLIROpenACCTypeInterfacesIncGen
LINK_LIBS PUBLIC
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index f5ec5a4..11780f8 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -117,39 +117,39 @@ void OpenMPDialect::initialize() {
/// ssa-id-and-type ::= ssa-id `:` type
static ParseResult parseAllocateAndAllocator(
OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operandsAllocate,
- SmallVectorImpl<Type> &typesAllocate,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operandsAllocator,
- SmallVectorImpl<Type> &typesAllocator) {
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &allocateVars,
+ SmallVectorImpl<Type> &allocateTypes,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &allocatorVars,
+ SmallVectorImpl<Type> &allocatorTypes) {
return parser.parseCommaSeparatedList([&]() {
OpAsmParser::UnresolvedOperand operand;
Type type;
if (parser.parseOperand(operand) || parser.parseColonType(type))
return failure();
- operandsAllocator.push_back(operand);
- typesAllocator.push_back(type);
+ allocatorVars.push_back(operand);
+ allocatorTypes.push_back(type);
if (parser.parseArrow())
return failure();
if (parser.parseOperand(operand) || parser.parseColonType(type))
return failure();
- operandsAllocate.push_back(operand);
- typesAllocate.push_back(type);
+ allocateVars.push_back(operand);
+ allocateTypes.push_back(type);
return success();
});
}
/// Print allocate clause
static void printAllocateAndAllocator(OpAsmPrinter &p, Operation *op,
- OperandRange varsAllocate,
- TypeRange typesAllocate,
- OperandRange varsAllocator,
- TypeRange typesAllocator) {
- for (unsigned i = 0; i < varsAllocate.size(); ++i) {
- std::string separator = i == varsAllocate.size() - 1 ? "" : ", ";
- p << varsAllocator[i] << " : " << typesAllocator[i] << " -> ";
- p << varsAllocate[i] << " : " << typesAllocate[i] << separator;
+ OperandRange allocateVars,
+ TypeRange allocateTypes,
+ OperandRange allocatorVars,
+ TypeRange allocatorTypes) {
+ for (unsigned i = 0; i < allocateVars.size(); ++i) {
+ std::string separator = i == allocateVars.size() - 1 ? "" : ", ";
+ p << allocatorVars[i] << " : " << allocatorTypes[i] << " -> ";
+ p << allocateVars[i] << " : " << allocateTypes[i] << separator;
}
}
@@ -183,11 +183,11 @@ void printClauseAttr(OpAsmPrinter &p, Operation *op, ClauseAttr attr) {
/// linear ::= `linear` `(` linear-list `)`
/// linear-list := linear-val | linear-val linear-list
/// linear-val := ssa-id-and-type `=` ssa-id-and-type
-static ParseResult
-parseLinearClause(OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &vars,
- SmallVectorImpl<Type> &types,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &stepVars) {
+static ParseResult parseLinearClause(
+ OpAsmParser &parser,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &linearVars,
+ SmallVectorImpl<Type> &linearTypes,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &linearStepVars) {
return parser.parseCommaSeparatedList([&]() {
OpAsmParser::UnresolvedOperand var;
Type type;
@@ -196,16 +196,16 @@ parseLinearClause(OpAsmParser &parser,
parser.parseOperand(stepVar) || parser.parseColonType(type))
return failure();
- vars.push_back(var);
- types.push_back(type);
- stepVars.push_back(stepVar);
+ linearVars.push_back(var);
+ linearTypes.push_back(type);
+ linearStepVars.push_back(stepVar);
return success();
});
}
/// Print Linear Clause
static void printLinearClause(OpAsmPrinter &p, Operation *op,
- ValueRange linearVars, TypeRange linearVarTypes,
+ ValueRange linearVars, TypeRange linearTypes,
ValueRange linearStepVars) {
size_t linearVarsSize = linearVars.size();
for (unsigned i = 0; i < linearVarsSize; ++i) {
@@ -221,12 +221,12 @@ static void printLinearClause(OpAsmPrinter &p, Operation *op,
// Verifier for Nontemporal Clause
//===----------------------------------------------------------------------===//
-static LogicalResult
-verifyNontemporalClause(Operation *op, OperandRange nontemporalVariables) {
+static LogicalResult verifyNontemporalClause(Operation *op,
+ OperandRange nontemporalVars) {
// Check if each var is unique - OpenMP 5.0 -> 2.9.3.1 section
DenseSet<Value> nontemporalItems;
- for (const auto &it : nontemporalVariables)
+ for (const auto &it : nontemporalVars)
if (!nontemporalItems.insert(it).second)
return op->emitOpError() << "nontemporal variable used more than once";
@@ -236,32 +236,32 @@ verifyNontemporalClause(Operation *op, OperandRange nontemporalVariables) {
//===----------------------------------------------------------------------===//
// Parser, verifier and printer for Aligned Clause
//===----------------------------------------------------------------------===//
-static LogicalResult
-verifyAlignedClause(Operation *op, std::optional<ArrayAttr> alignmentValues,
- OperandRange alignedVariables) {
+static LogicalResult verifyAlignedClause(Operation *op,
+ std::optional<ArrayAttr> alignments,
+ OperandRange alignedVars) {
// Check if number of alignment values equals to number of aligned variables
- if (!alignedVariables.empty()) {
- if (!alignmentValues || alignmentValues->size() != alignedVariables.size())
+ if (!alignedVars.empty()) {
+ if (!alignments || alignments->size() != alignedVars.size())
return op->emitOpError()
<< "expected as many alignment values as aligned variables";
} else {
- if (alignmentValues)
+ if (alignments)
return op->emitOpError() << "unexpected alignment values attribute";
return success();
}
// Check if each var is aligned only once - OpenMP 4.5 -> 2.8.1 section
DenseSet<Value> alignedItems;
- for (auto it : alignedVariables)
+ for (auto it : alignedVars)
if (!alignedItems.insert(it).second)
return op->emitOpError() << "aligned variable used more than once";
- if (!alignmentValues)
+ if (!alignments)
return success();
// Check if all alignment values are positive - OpenMP 4.5 -> 2.8.1 section
- for (unsigned i = 0; i < (*alignmentValues).size(); ++i) {
- if (auto intAttr = llvm::dyn_cast<IntegerAttr>((*alignmentValues)[i])) {
+ for (unsigned i = 0; i < (*alignments).size(); ++i) {
+ if (auto intAttr = llvm::dyn_cast<IntegerAttr>((*alignments)[i])) {
if (intAttr.getValue().sle(0))
return op->emitOpError() << "alignment should be greater than 0";
} else {
@@ -275,14 +275,15 @@ verifyAlignedClause(Operation *op, std::optional<ArrayAttr> alignmentValues,
/// aligned ::= `aligned` `(` aligned-list `)`
/// aligned-list := aligned-val | aligned-val aligned-list
/// aligned-val := ssa-id-and-type `->` alignment
-static ParseResult parseAlignedClause(
- OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &alignedItems,
- SmallVectorImpl<Type> &types, ArrayAttr &alignmentValues) {
+static ParseResult
+parseAlignedClause(OpAsmParser &parser,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &alignedVars,
+ SmallVectorImpl<Type> &alignedTypes,
+ ArrayAttr &alignmentsAttr) {
SmallVector<Attribute> alignmentVec;
if (failed(parser.parseCommaSeparatedList([&]() {
- if (parser.parseOperand(alignedItems.emplace_back()) ||
- parser.parseColonType(types.emplace_back()) ||
+ if (parser.parseOperand(alignedVars.emplace_back()) ||
+ parser.parseColonType(alignedTypes.emplace_back()) ||
parser.parseArrow() ||
parser.parseAttribute(alignmentVec.emplace_back())) {
return failure();
@@ -291,20 +292,19 @@ static ParseResult parseAlignedClause(
})))
return failure();
SmallVector<Attribute> alignments(alignmentVec.begin(), alignmentVec.end());
- alignmentValues = ArrayAttr::get(parser.getContext(), alignments);
+ alignmentsAttr = ArrayAttr::get(parser.getContext(), alignments);
return success();
}
/// Print Aligned Clause
static void printAlignedClause(OpAsmPrinter &p, Operation *op,
- ValueRange alignedVars,
- TypeRange alignedVarTypes,
- std::optional<ArrayAttr> alignmentValues) {
+ ValueRange alignedVars, TypeRange alignedTypes,
+ std::optional<ArrayAttr> alignments) {
for (unsigned i = 0; i < alignedVars.size(); ++i) {
if (i != 0)
p << ", ";
p << alignedVars[i] << " : " << alignedVars[i].getType();
- p << " -> " << (*alignmentValues)[i];
+ p << " -> " << (*alignments)[i];
}
}
@@ -353,10 +353,11 @@ verifyScheduleModifiers(OpAsmParser &parser,
/// sched-wo-chunk ::= `auto` | `runtime`
/// sched-modifier ::= sched-mod-val | sched-mod-val `,` sched-mod-val
/// sched-mod-val ::= `monotonic` | `nonmonotonic` | `simd` | `none`
-static ParseResult parseScheduleClause(
- OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr,
- ScheduleModifierAttr &scheduleModifier, UnitAttr &simdModifier,
- std::optional<OpAsmParser::UnresolvedOperand> &chunkSize, Type &chunkType) {
+static ParseResult
+parseScheduleClause(OpAsmParser &parser, ClauseScheduleKindAttr &scheduleAttr,
+ ScheduleModifierAttr &scheduleMod, UnitAttr &scheduleSimd,
+ std::optional<OpAsmParser::UnresolvedOperand> &chunkSize,
+ Type &chunkType) {
StringRef keyword;
if (parser.parseKeyword(&keyword))
return failure();
@@ -399,14 +400,14 @@ static ParseResult parseScheduleClause(
SMLoc loc = parser.getCurrentLocation();
if (std::optional<ScheduleModifier> mod =
symbolizeScheduleModifier(modifiers[0])) {
- scheduleModifier = ScheduleModifierAttr::get(parser.getContext(), *mod);
+ scheduleMod = ScheduleModifierAttr::get(parser.getContext(), *mod);
} else {
return parser.emitError(loc, "invalid schedule modifier");
}
// Only SIMD attribute is allowed here!
if (modifiers.size() > 1) {
assert(symbolizeScheduleModifier(modifiers[1]) == ScheduleModifier::simd);
- simdModifier = UnitAttr::get(parser.getBuilder().getContext());
+ scheduleSimd = UnitAttr::get(parser.getBuilder().getContext());
}
}
@@ -415,16 +416,16 @@ static ParseResult parseScheduleClause(
/// Print schedule clause
static void printScheduleClause(OpAsmPrinter &p, Operation *op,
- ClauseScheduleKindAttr schedAttr,
- ScheduleModifierAttr modifier, UnitAttr simd,
- Value scheduleChunkVar,
+ ClauseScheduleKindAttr scheduleKind,
+ ScheduleModifierAttr scheduleMod,
+ UnitAttr scheduleSimd, Value scheduleChunk,
Type scheduleChunkType) {
- p << stringifyClauseScheduleKind(schedAttr.getValue());
- if (scheduleChunkVar)
- p << " = " << scheduleChunkVar << " : " << scheduleChunkVar.getType();
- if (modifier)
- p << ", " << stringifyScheduleModifier(modifier.getValue());
- if (simd)
+ p << stringifyClauseScheduleKind(scheduleKind.getValue());
+ if (scheduleChunk)
+ p << " = " << scheduleChunk << " : " << scheduleChunk.getType();
+ if (scheduleMod)
+ p << ", " << stringifyScheduleModifier(scheduleMod.getValue());
+ if (scheduleSimd)
p << ", simd";
}
@@ -435,15 +436,15 @@ static void printScheduleClause(OpAsmPrinter &p, Operation *op,
// order ::= `order` `(` [order-modifier ':'] concurrent `)`
// order-modifier ::= reproducible | unconstrained
static ParseResult parseOrderClause(OpAsmParser &parser,
- ClauseOrderKindAttr &kindAttr,
- OrderModifierAttr &modifierAttr) {
+ ClauseOrderKindAttr &order,
+ OrderModifierAttr &orderMod) {
StringRef enumStr;
SMLoc loc = parser.getCurrentLocation();
if (parser.parseKeyword(&enumStr))
return failure();
if (std::optional<OrderModifier> enumValue =
symbolizeOrderModifier(enumStr)) {
- modifierAttr = OrderModifierAttr::get(parser.getContext(), *enumValue);
+ orderMod = OrderModifierAttr::get(parser.getContext(), *enumValue);
if (parser.parseOptionalColon())
return failure();
loc = parser.getCurrentLocation();
@@ -452,19 +453,19 @@ static ParseResult parseOrderClause(OpAsmParser &parser,
}
if (std::optional<ClauseOrderKind> enumValue =
symbolizeClauseOrderKind(enumStr)) {
- kindAttr = ClauseOrderKindAttr::get(parser.getContext(), *enumValue);
+ order = ClauseOrderKindAttr::get(parser.getContext(), *enumValue);
return success();
}
return parser.emitError(loc, "invalid clause value: '") << enumStr << "'";
}
static void printOrderClause(OpAsmPrinter &p, Operation *op,
- ClauseOrderKindAttr kindAttr,
- OrderModifierAttr modifierAttr) {
- if (modifierAttr)
- p << stringifyOrderModifier(modifierAttr.getValue()) << ":";
- if (kindAttr)
- p << stringifyClauseOrderKind(kindAttr.getValue());
+ ClauseOrderKindAttr order,
+ OrderModifierAttr orderMod) {
+ if (orderMod)
+ p << stringifyOrderModifier(orderMod.getValue()) << ":";
+ if (order)
+ p << stringifyClauseOrderKind(order.getValue());
}
//===----------------------------------------------------------------------===//
@@ -474,8 +475,7 @@ static void printOrderClause(OpAsmPrinter &p, Operation *op,
static ParseResult parseClauseWithRegionArgs(
OpAsmParser &parser, Region &region,
SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
- SmallVectorImpl<Type> &types, DenseBoolArrayAttr &isByRef,
- ArrayAttr &symbols,
+ SmallVectorImpl<Type> &types, DenseBoolArrayAttr &byref, ArrayAttr &symbols,
SmallVectorImpl<OpAsmParser::Argument> &regionPrivateArgs) {
SmallVector<SymbolRefAttr> reductionVec;
SmallVector<bool> isByRefVec;
@@ -494,7 +494,7 @@ static ParseResult parseClauseWithRegionArgs(
return success();
})))
return failure();
- isByRef = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
+ byref = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
auto *argsBegin = regionPrivateArgs.begin();
MutableArrayRef argsSubrange(argsBegin + regionArgOffset,
@@ -510,13 +510,13 @@ static ParseResult parseClauseWithRegionArgs(
static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op,
ValueRange argsSubrange,
StringRef clauseName, ValueRange operands,
- TypeRange types, DenseBoolArrayAttr byRef,
+ TypeRange types, DenseBoolArrayAttr byref,
ArrayAttr symbols) {
if (!clauseName.empty())
p << clauseName << "(";
llvm::interleaveComma(llvm::zip_equal(symbols, operands, argsSubrange, types,
- byRef.asArrayRef()),
+ byref.asArrayRef()),
p, [&p](auto t) {
auto [sym, op, arg, type, isByRef] = t;
p << (isByRef ? "byref " : "") << sym << " " << op
@@ -529,28 +529,27 @@ static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op,
static ParseResult parseParallelRegion(
OpAsmParser &parser, Region &region,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &reductionVarOperands,
- SmallVectorImpl<Type> &reductionVarTypes,
- DenseBoolArrayAttr &reductionByRef, ArrayAttr &reductionSymbols,
- llvm::SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateVarOperands,
- llvm::SmallVectorImpl<Type> &privateVarsTypes,
- ArrayAttr &privatizerSymbols) {
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &reductionVars,
+ SmallVectorImpl<Type> &reductionTypes, DenseBoolArrayAttr &reductionByref,
+ ArrayAttr &reductionSyms,
+ llvm::SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateVars,
+ llvm::SmallVectorImpl<Type> &privateTypes, ArrayAttr &privateSyms) {
llvm::SmallVector<OpAsmParser::Argument> regionPrivateArgs;
if (succeeded(parser.parseOptionalKeyword("reduction"))) {
- if (failed(parseClauseWithRegionArgs(parser, region, reductionVarOperands,
- reductionVarTypes, reductionByRef,
- reductionSymbols, regionPrivateArgs)))
+ if (failed(parseClauseWithRegionArgs(parser, region, reductionVars,
+ reductionTypes, reductionByref,
+ reductionSyms, regionPrivateArgs)))
return failure();
}
if (succeeded(parser.parseOptionalKeyword("private"))) {
- auto privateByRef = DenseBoolArrayAttr::get(parser.getContext(), {});
- if (failed(parseClauseWithRegionArgs(parser, region, privateVarOperands,
- privateVarsTypes, privateByRef,
- privatizerSymbols, regionPrivateArgs)))
+ auto privateByref = DenseBoolArrayAttr::get(parser.getContext(), {});
+ if (failed(parseClauseWithRegionArgs(parser, region, privateVars,
+ privateTypes, privateByref,
+ privateSyms, regionPrivateArgs)))
return failure();
- if (llvm::any_of(privateByRef.asArrayRef(),
+ if (llvm::any_of(privateByref.asArrayRef(),
[](bool byref) { return byref; })) {
parser.emitError(parser.getCurrentLocation(),
"private clause cannot have byref attributes");
@@ -562,35 +561,30 @@ static ParseResult parseParallelRegion(
}
static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region &region,
- ValueRange reductionVarOperands,
- TypeRange reductionVarTypes,
- DenseBoolArrayAttr reductionVarIsByRef,
- ArrayAttr reductionSymbols,
- ValueRange privateVarOperands,
- TypeRange privateVarTypes,
- ArrayAttr privatizerSymbols) {
- if (reductionSymbols) {
+ ValueRange reductionVars,
+ TypeRange reductionTypes,
+ DenseBoolArrayAttr reductionByref,
+ ArrayAttr reductionSyms, ValueRange privateVars,
+ TypeRange privateTypes, ArrayAttr privateSyms) {
+ if (reductionSyms) {
auto *argsBegin = region.front().getArguments().begin();
- MutableArrayRef argsSubrange(argsBegin,
- argsBegin + reductionVarTypes.size());
- printClauseWithRegionArgs(p, op, argsSubrange, "reduction",
- reductionVarOperands, reductionVarTypes,
- reductionVarIsByRef, reductionSymbols);
+ MutableArrayRef argsSubrange(argsBegin, argsBegin + reductionTypes.size());
+ printClauseWithRegionArgs(p, op, argsSubrange, "reduction", reductionVars,
+ reductionTypes, reductionByref, reductionSyms);
}
- if (privatizerSymbols) {
+ if (privateSyms) {
auto *argsBegin = region.front().getArguments().begin();
- MutableArrayRef argsSubrange(argsBegin + reductionVarOperands.size(),
- argsBegin + reductionVarOperands.size() +
- privateVarTypes.size());
+ MutableArrayRef argsSubrange(argsBegin + reductionVars.size(),
+ argsBegin + reductionVars.size() +
+ privateTypes.size());
mlir::SmallVector<bool> isByRefVec;
- isByRefVec.resize(privateVarTypes.size(), false);
+ isByRefVec.resize(privateTypes.size(), false);
DenseBoolArrayAttr isByRef =
makeDenseBoolArrayAttr(op->getContext(), isByRefVec);
- printClauseWithRegionArgs(p, op, argsSubrange, "private",
- privateVarOperands, privateVarTypes, isByRef,
- privatizerSymbols);
+ printClauseWithRegionArgs(p, op, argsSubrange, "private", privateVars,
+ privateTypes, isByRef, privateSyms);
}
p.printRegion(region, /*printEntryBlockArgs=*/false);
@@ -599,41 +593,41 @@ static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region &region,
/// reduction-entry-list ::= reduction-entry
/// | reduction-entry-list `,` reduction-entry
/// reduction-entry ::= (`byref`)? symbol-ref `->` ssa-id `:` type
-static ParseResult
-parseReductionVarList(OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
- SmallVectorImpl<Type> &types, DenseBoolArrayAttr &isByRef,
- ArrayAttr &reductionSymbols) {
+static ParseResult parseReductionVarList(
+ OpAsmParser &parser,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &reductionVars,
+ SmallVectorImpl<Type> &reductionTypes, DenseBoolArrayAttr &reductionByref,
+ ArrayAttr &reductionSyms) {
SmallVector<SymbolRefAttr> reductionVec;
SmallVector<bool> isByRefVec;
if (failed(parser.parseCommaSeparatedList([&]() {
ParseResult optionalByref = parser.parseOptionalKeyword("byref");
if (parser.parseAttribute(reductionVec.emplace_back()) ||
parser.parseArrow() ||
- parser.parseOperand(operands.emplace_back()) ||
- parser.parseColonType(types.emplace_back()))
+ parser.parseOperand(reductionVars.emplace_back()) ||
+ parser.parseColonType(reductionTypes.emplace_back()))
return failure();
isByRefVec.push_back(optionalByref.succeeded());
return success();
})))
return failure();
- isByRef = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
+ reductionByref = makeDenseBoolArrayAttr(parser.getContext(), isByRefVec);
SmallVector<Attribute> reductions(reductionVec.begin(), reductionVec.end());
- reductionSymbols = ArrayAttr::get(parser.getContext(), reductions);
+ reductionSyms = ArrayAttr::get(parser.getContext(), reductions);
return success();
}
/// Print Reduction clause
-static void printReductionVarList(OpAsmPrinter &p, Operation *op,
- OperandRange reductionVars,
- TypeRange reductionTypes,
- std::optional<DenseBoolArrayAttr> isByRef,
- std::optional<ArrayAttr> reductions) {
+static void
+printReductionVarList(OpAsmPrinter &p, Operation *op,
+ OperandRange reductionVars, TypeRange reductionTypes,
+ std::optional<DenseBoolArrayAttr> reductionByref,
+ std::optional<ArrayAttr> reductionSyms) {
auto getByRef = [&](unsigned i) -> const char * {
- if (!isByRef || !*isByRef)
+ if (!reductionByref || !*reductionByref)
return "";
- assert(isByRef->empty() || i < isByRef->size());
- if (!isByRef->empty() && (*isByRef)[i])
+ assert(reductionByref->empty() || i < reductionByref->size());
+ if (!reductionByref->empty() && (*reductionByref)[i])
return "byref ";
return "";
};
@@ -641,26 +635,26 @@ static void printReductionVarList(OpAsmPrinter &p, Operation *op,
for (unsigned i = 0, e = reductionVars.size(); i < e; ++i) {
if (i != 0)
p << ", ";
- p << getByRef(i) << (*reductions)[i] << " -> " << reductionVars[i] << " : "
- << reductionVars[i].getType();
+ p << getByRef(i) << (*reductionSyms)[i] << " -> " << reductionVars[i]
+ << " : " << reductionVars[i].getType();
}
}
/// Verifies Reduction Clause
static LogicalResult
-verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductions,
+verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductionSyms,
OperandRange reductionVars,
- std::optional<ArrayRef<bool>> byRef) {
+ std::optional<ArrayRef<bool>> reductionByref) {
if (!reductionVars.empty()) {
- if (!reductions || reductions->size() != reductionVars.size())
+ if (!reductionSyms || reductionSyms->size() != reductionVars.size())
return op->emitOpError()
<< "expected as many reduction symbol references "
"as reduction variables";
- if (byRef && byRef->size() != reductionVars.size())
+ if (reductionByref && reductionByref->size() != reductionVars.size())
return op->emitError() << "expected as many reduction variable by "
"reference attributes as reduction variables";
} else {
- if (reductions)
+ if (reductionSyms)
return op->emitOpError() << "unexpected reduction symbol references";
return success();
}
@@ -668,7 +662,7 @@ verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductions,
// TODO: The following should be done in
// SymbolUserOpInterface::verifySymbolUses.
DenseSet<Value> accumulators;
- for (auto args : llvm::zip(reductionVars, *reductions)) {
+ for (auto args : llvm::zip(reductionVars, *reductionSyms)) {
Value accum = std::get<0>(args);
if (!accumulators.insert(accum).second)
@@ -693,41 +687,40 @@ verifyReductionVarList(Operation *op, std::optional<ArrayAttr> reductions,
}
//===----------------------------------------------------------------------===//
-// Parser, printer and verifier for CopyPrivateVarList
+// Parser, printer and verifier for Copyprivate
//===----------------------------------------------------------------------===//
/// copyprivate-entry-list ::= copyprivate-entry
/// | copyprivate-entry-list `,` copyprivate-entry
/// copyprivate-entry ::= ssa-id `->` symbol-ref `:` type
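/// e.g. a single entry (illustrative value and symbol names):
///   %0 -> @copy_i32 : !llvm.ptr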
-static ParseResult parseCopyPrivateVarList(
+static ParseResult parseCopyprivate(
OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
- SmallVectorImpl<Type> &types, ArrayAttr &copyPrivateSymbols) {
- SmallVector<SymbolRefAttr> copyPrivateFuncsVec;
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &copyprivateVars,
+ SmallVectorImpl<Type> &copyprivateTypes, ArrayAttr &copyprivateSyms) {
+ SmallVector<SymbolRefAttr> symsVec;
if (failed(parser.parseCommaSeparatedList([&]() {
- if (parser.parseOperand(operands.emplace_back()) ||
+ if (parser.parseOperand(copyprivateVars.emplace_back()) ||
parser.parseArrow() ||
- parser.parseAttribute(copyPrivateFuncsVec.emplace_back()) ||
- parser.parseColonType(types.emplace_back()))
+ parser.parseAttribute(symsVec.emplace_back()) ||
+ parser.parseColonType(copyprivateTypes.emplace_back()))
return failure();
return success();
})))
return failure();
- SmallVector<Attribute> copyPrivateFuncs(copyPrivateFuncsVec.begin(),
- copyPrivateFuncsVec.end());
- copyPrivateSymbols = ArrayAttr::get(parser.getContext(), copyPrivateFuncs);
+ SmallVector<Attribute> syms(symsVec.begin(), symsVec.end());
+ copyprivateSyms = ArrayAttr::get(parser.getContext(), syms);
return success();
}
-/// Print CopyPrivate clause
-static void printCopyPrivateVarList(OpAsmPrinter &p, Operation *op,
- OperandRange copyPrivateVars,
- TypeRange copyPrivateTypes,
- std::optional<ArrayAttr> copyPrivateFuncs) {
- if (!copyPrivateFuncs.has_value())
+/// Print Copyprivate clause
+static void printCopyprivate(OpAsmPrinter &p, Operation *op,
+ OperandRange copyprivateVars,
+ TypeRange copyprivateTypes,
+ std::optional<ArrayAttr> copyprivateSyms) {
+ if (!copyprivateSyms.has_value())
return;
llvm::interleaveComma(
- llvm::zip(copyPrivateVars, *copyPrivateFuncs, copyPrivateTypes), p,
+ llvm::zip(copyprivateVars, *copyprivateSyms, copyprivateTypes), p,
[&](const auto &args) {
p << std::get<0>(args) << " -> " << std::get<1>(args) << " : "
<< std::get<2>(args);
@@ -736,22 +729,22 @@ static void printCopyPrivateVarList(OpAsmPrinter &p, Operation *op,
/// Verifies CopyPrivate Clause
static LogicalResult
-verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars,
- std::optional<ArrayAttr> copyPrivateFuncs) {
- size_t copyPrivateFuncsSize =
- copyPrivateFuncs.has_value() ? copyPrivateFuncs->size() : 0;
- if (copyPrivateFuncsSize != copyPrivateVars.size())
- return op->emitOpError() << "inconsistent number of copyPrivate vars (= "
- << copyPrivateVars.size()
- << ") and functions (= " << copyPrivateFuncsSize
+verifyCopyprivateVarList(Operation *op, OperandRange copyprivateVars,
+ std::optional<ArrayAttr> copyprivateSyms) {
+ size_t copyprivateSymsSize =
+ copyprivateSyms.has_value() ? copyprivateSyms->size() : 0;
+ if (copyprivateSymsSize != copyprivateVars.size())
+ return op->emitOpError() << "inconsistent number of copyprivate vars (= "
+ << copyprivateVars.size()
+ << ") and functions (= " << copyprivateSymsSize
<< "), both must be equal";
- if (!copyPrivateFuncs.has_value())
+ if (!copyprivateSyms.has_value())
return success();
- for (auto copyPrivateVarAndFunc :
- llvm::zip(copyPrivateVars, *copyPrivateFuncs)) {
+ for (auto copyprivateVarAndSym :
+ llvm::zip(copyprivateVars, *copyprivateSyms)) {
auto symbolRef =
- llvm::cast<SymbolRefAttr>(std::get<1>(copyPrivateVarAndFunc));
+ llvm::cast<SymbolRefAttr>(std::get<1>(copyprivateVarAndSym));
std::optional<std::variant<mlir::func::FuncOp, mlir::LLVM::LLVMFuncOp>>
funcOp;
if (mlir::func::FuncOp mlirFuncOp =
@@ -785,7 +778,7 @@ verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars,
return op->emitOpError() << "expected copy function " << symbolRef
<< " arguments to have the same type";
- Type varType = std::get<0>(copyPrivateVarAndFunc).getType();
+ Type varType = std::get<0>(copyprivateVarAndSym).getType();
if (argTy != varType)
return op->emitOpError()
<< "expected copy function arguments' type (" << argTy
@@ -805,39 +798,39 @@ verifyCopyPrivateVarList(Operation *op, OperandRange copyPrivateVars,
/// depend-entry ::= depend-kind `->` ssa-id `:` type
static ParseResult
parseDependVarList(OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
- SmallVectorImpl<Type> &types, ArrayAttr &dependsArray) {
- SmallVector<ClauseTaskDependAttr> dependVec;
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dependVars,
+ SmallVectorImpl<Type> &dependTypes, ArrayAttr &dependKinds) {
+ SmallVector<ClauseTaskDependAttr> kindsVec;
if (failed(parser.parseCommaSeparatedList([&]() {
StringRef keyword;
if (parser.parseKeyword(&keyword) || parser.parseArrow() ||
- parser.parseOperand(operands.emplace_back()) ||
- parser.parseColonType(types.emplace_back()))
+ parser.parseOperand(dependVars.emplace_back()) ||
+ parser.parseColonType(dependTypes.emplace_back()))
return failure();
if (std::optional<ClauseTaskDepend> keywordDepend =
(symbolizeClauseTaskDepend(keyword)))
- dependVec.emplace_back(
+ kindsVec.emplace_back(
ClauseTaskDependAttr::get(parser.getContext(), *keywordDepend));
else
return failure();
return success();
})))
return failure();
- SmallVector<Attribute> depends(dependVec.begin(), dependVec.end());
- dependsArray = ArrayAttr::get(parser.getContext(), depends);
+ SmallVector<Attribute> kinds(kindsVec.begin(), kindsVec.end());
+ dependKinds = ArrayAttr::get(parser.getContext(), kinds);
return success();
}
/// Print Depend clause
static void printDependVarList(OpAsmPrinter &p, Operation *op,
OperandRange dependVars, TypeRange dependTypes,
- std::optional<ArrayAttr> depends) {
+ std::optional<ArrayAttr> dependKinds) {
- for (unsigned i = 0, e = depends->size(); i < e; ++i) {
+ for (unsigned i = 0, e = dependKinds->size(); i < e; ++i) {
if (i != 0)
p << ", ";
p << stringifyClauseTaskDepend(
- llvm::cast<mlir::omp::ClauseTaskDependAttr>((*depends)[i])
+ llvm::cast<mlir::omp::ClauseTaskDependAttr>((*dependKinds)[i])
.getValue())
<< " -> " << dependVars[i] << " : " << dependTypes[i];
}
@@ -845,14 +838,14 @@ static void printDependVarList(OpAsmPrinter &p, Operation *op,
/// Verifies Depend clause
static LogicalResult verifyDependVarList(Operation *op,
- std::optional<ArrayAttr> depends,
+ std::optional<ArrayAttr> dependKinds,
OperandRange dependVars) {
if (!dependVars.empty()) {
- if (!depends || depends->size() != dependVars.size())
+ if (!dependKinds || dependKinds->size() != dependVars.size())
return op->emitOpError() << "expected as many depend values"
" as depend variables";
} else {
- if (depends && !depends->empty())
+ if (dependKinds && !dependKinds->empty())
return op->emitOpError() << "unexpected depend values";
return success();
}
@@ -1144,8 +1137,8 @@ static void printMembersIndex(OpAsmPrinter &p, MapInfoOp op,
static ParseResult
parseMapEntries(OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &mapOperands,
- SmallVectorImpl<Type> &mapOperandTypes) {
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &mapVars,
+ SmallVectorImpl<Type> &mapTypes) {
OpAsmParser::UnresolvedOperand arg;
OpAsmParser::UnresolvedOperand blockArg;
Type argType;
@@ -1154,14 +1147,14 @@ parseMapEntries(OpAsmParser &parser,
return failure();
if (succeeded(parser.parseOptionalArrow()) && parser.parseOperand(blockArg))
return failure();
- mapOperands.push_back(arg);
+ mapVars.push_back(arg);
return success();
};
auto parseTypes = [&]() -> ParseResult {
if (parser.parseType(argType))
return failure();
- mapOperandTypes.push_back(argType);
+ mapTypes.push_back(argType);
return success();
};
@@ -1178,48 +1171,47 @@ parseMapEntries(OpAsmParser &parser,
}
static void printMapEntries(OpAsmPrinter &p, Operation *op,
- OperandRange mapOperands,
- TypeRange mapOperandTypes) {
+ OperandRange mapVars, TypeRange mapTypes) {
// Get pointer to the region if this is an omp.target, because printing map
// clauses for that operation has to also show the correspondence of each
// variable to the corresponding block argument.
Block *entryBlock = isa<TargetOp>(op) ? &op->getRegion(0).front() : nullptr;
unsigned argIndex = 0;
- for (const auto &mapOp : mapOperands) {
+ for (const auto &mapOp : mapVars) {
p << mapOp;
if (entryBlock) {
const auto &blockArg = entryBlock->getArgument(argIndex);
p << " -> " << blockArg;
}
argIndex++;
- if (argIndex < mapOperands.size())
+ if (argIndex < mapVars.size())
p << ", ";
}
p << " : ";
argIndex = 0;
- for (const auto &mapType : mapOperandTypes) {
+ for (const auto &mapType : mapTypes) {
p << mapType;
argIndex++;
- if (argIndex < mapOperands.size())
+ if (argIndex < mapVars.size())
p << ", ";
}
}
-static ParseResult parsePrivateList(
- OpAsmParser &parser,
- SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateOperands,
- SmallVectorImpl<Type> &privateOperandTypes, ArrayAttr &privatizerSymbols) {
+static ParseResult
+parsePrivateList(OpAsmParser &parser,
+ SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateVars,
+ SmallVectorImpl<Type> &privateTypes, ArrayAttr &privateSyms) {
SmallVector<SymbolRefAttr> privateSymRefs;
SmallVector<OpAsmParser::Argument> regionPrivateArgs;
if (failed(parser.parseCommaSeparatedList([&]() {
if (parser.parseAttribute(privateSymRefs.emplace_back()) ||
- parser.parseOperand(privateOperands.emplace_back()) ||
+ parser.parseOperand(privateVars.emplace_back()) ||
parser.parseArrow() ||
parser.parseArgument(regionPrivateArgs.emplace_back()) ||
- parser.parseColonType(privateOperandTypes.emplace_back()))
+ parser.parseColonType(privateTypes.emplace_back()))
return failure();
return success();
})))
@@ -1227,32 +1219,31 @@ static ParseResult parsePrivateList(
SmallVector<Attribute> privateSymAttrs(privateSymRefs.begin(),
privateSymRefs.end());
- privatizerSymbols = ArrayAttr::get(parser.getContext(), privateSymAttrs);
+ privateSyms = ArrayAttr::get(parser.getContext(), privateSymAttrs);
return success();
}
static void printPrivateList(OpAsmPrinter &p, Operation *op,
- ValueRange privateVarOperands,
- TypeRange privateVarTypes,
- ArrayAttr privatizerSymbols) {
+ ValueRange privateVars, TypeRange privateTypes,
+ ArrayAttr privateSyms) {
// TODO: Remove target-specific logic from this function.
auto targetOp = mlir::dyn_cast<mlir::omp::TargetOp>(op);
assert(targetOp);
auto &region = op->getRegion(0);
auto *argsBegin = region.front().getArguments().begin();
- MutableArrayRef argsSubrange(argsBegin + targetOp.getMapOperands().size(),
- argsBegin + targetOp.getMapOperands().size() +
- privateVarTypes.size());
+ MutableArrayRef argsSubrange(argsBegin + targetOp.getMapVars().size(),
+ argsBegin + targetOp.getMapVars().size() +
+ privateTypes.size());
mlir::SmallVector<bool> isByRefVec;
- isByRefVec.resize(privateVarTypes.size(), false);
+ isByRefVec.resize(privateTypes.size(), false);
DenseBoolArrayAttr isByRef =
DenseBoolArrayAttr::get(op->getContext(), isByRefVec);
- printClauseWithRegionArgs(
- p, op, argsSubrange, /*clauseName=*/llvm::StringRef{}, privateVarOperands,
- privateVarTypes, isByRef, privatizerSymbols);
+ printClauseWithRegionArgs(p, op, argsSubrange,
+ /*clauseName=*/llvm::StringRef{}, privateVars,
+ privateTypes, isByRef, privateSyms);
}
static void printCaptureType(OpAsmPrinter &p, Operation *op,
@@ -1271,32 +1262,32 @@ static void printCaptureType(OpAsmPrinter &p, Operation *op,
}
static ParseResult parseCaptureType(OpAsmParser &parser,
- VariableCaptureKindAttr &mapCapture) {
+ VariableCaptureKindAttr &mapCaptureType) {
StringRef mapCaptureKey;
if (parser.parseKeyword(&mapCaptureKey))
return failure();
if (mapCaptureKey == "This")
- mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+ mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
parser.getContext(), mlir::omp::VariableCaptureKind::This);
if (mapCaptureKey == "ByRef")
- mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+ mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
parser.getContext(), mlir::omp::VariableCaptureKind::ByRef);
if (mapCaptureKey == "ByCopy")
- mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+ mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
parser.getContext(), mlir::omp::VariableCaptureKind::ByCopy);
if (mapCaptureKey == "VLAType")
- mapCapture = mlir::omp::VariableCaptureKindAttr::get(
+ mapCaptureType = mlir::omp::VariableCaptureKindAttr::get(
parser.getContext(), mlir::omp::VariableCaptureKind::VLAType);
return success();
}
-static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) {
+static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) {
llvm::DenseSet<mlir::TypedValue<mlir::omp::PointerLikeType>> updateToVars;
llvm::DenseSet<mlir::TypedValue<mlir::omp::PointerLikeType>> updateFromVars;
- for (auto mapOp : mapOperands) {
+ for (auto mapOp : mapVars) {
if (!mapOp.getDefiningOp())
emitError(op->getLoc(), "missing map operation");
@@ -1378,19 +1369,20 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) {
//===----------------------------------------------------------------------===//
void TargetDataOp::build(OpBuilder &builder, OperationState &state,
- const TargetDataClauseOps &clauses) {
- TargetDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
- clauses.useDevicePtrVars, clauses.useDeviceAddrVars,
- clauses.mapVars);
+ const TargetDataOperands &clauses) {
+ TargetDataOp::build(builder, state, clauses.device, clauses.ifVar,
+ clauses.mapVars, clauses.useDeviceAddrVars,
+ clauses.useDevicePtrVars);
}
LogicalResult TargetDataOp::verify() {
- if (getMapOperands().empty() && getUseDevicePtr().empty() &&
- getUseDeviceAddr().empty()) {
- return ::emitError(this->getLoc(), "At least one of map, useDevicePtr, or "
- "useDeviceAddr operand must be present");
+ if (getMapVars().empty() && getUseDevicePtrVars().empty() &&
+ getUseDeviceAddrVars().empty()) {
+ return ::emitError(this->getLoc(),
+ "At least one of map, use_device_ptr_vars, or "
+ "use_device_addr_vars operand must be present");
}
- return verifyMapClause(*this, getMapOperands());
+ return verifyMapClause(*this, getMapVars());
}
//===----------------------------------------------------------------------===//
@@ -1399,40 +1391,39 @@ LogicalResult TargetDataOp::verify() {
void TargetEnterDataOp::build(
OpBuilder &builder, OperationState &state,
- const TargetEnterExitUpdateDataClauseOps &clauses) {
+ const TargetEnterExitUpdateDataOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- TargetEnterDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
- makeArrayAttr(ctx, clauses.dependTypeAttrs),
- clauses.dependVars, clauses.nowaitAttr,
- clauses.mapVars);
+ TargetEnterDataOp::build(builder, state,
+ makeArrayAttr(ctx, clauses.dependKinds),
+ clauses.dependVars, clauses.device, clauses.ifVar,
+ clauses.mapVars, clauses.nowait);
}
LogicalResult TargetEnterDataOp::verify() {
LogicalResult verifyDependVars =
- verifyDependVarList(*this, getDepends(), getDependVars());
+ verifyDependVarList(*this, getDependKinds(), getDependVars());
return failed(verifyDependVars) ? verifyDependVars
- : verifyMapClause(*this, getMapOperands());
+ : verifyMapClause(*this, getMapVars());
}
//===----------------------------------------------------------------------===//
// TargetExitDataOp
//===----------------------------------------------------------------------===//
-void TargetExitDataOp::build(
- OpBuilder &builder, OperationState &state,
- const TargetEnterExitUpdateDataClauseOps &clauses) {
+void TargetExitDataOp::build(OpBuilder &builder, OperationState &state,
+ const TargetEnterExitUpdateDataOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- TargetExitDataOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
- makeArrayAttr(ctx, clauses.dependTypeAttrs),
- clauses.dependVars, clauses.nowaitAttr,
- clauses.mapVars);
+ TargetExitDataOp::build(builder, state,
+ makeArrayAttr(ctx, clauses.dependKinds),
+ clauses.dependVars, clauses.device, clauses.ifVar,
+ clauses.mapVars, clauses.nowait);
}
LogicalResult TargetExitDataOp::verify() {
LogicalResult verifyDependVars =
- verifyDependVarList(*this, getDepends(), getDependVars());
+ verifyDependVarList(*this, getDependKinds(), getDependVars());
return failed(verifyDependVars) ? verifyDependVars
- : verifyMapClause(*this, getMapOperands());
+ : verifyMapClause(*this, getMapVars());
}
//===----------------------------------------------------------------------===//
@@ -1440,19 +1431,18 @@ LogicalResult TargetExitDataOp::verify() {
//===----------------------------------------------------------------------===//
void TargetUpdateOp::build(OpBuilder &builder, OperationState &state,
- const TargetEnterExitUpdateDataClauseOps &clauses) {
+ const TargetEnterExitUpdateDataOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- TargetUpdateOp::build(builder, state, clauses.ifVar, clauses.deviceVar,
- makeArrayAttr(ctx, clauses.dependTypeAttrs),
- clauses.dependVars, clauses.nowaitAttr,
- clauses.mapVars);
+ TargetUpdateOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds),
+ clauses.dependVars, clauses.device, clauses.ifVar,
+ clauses.mapVars, clauses.nowait);
}
LogicalResult TargetUpdateOp::verify() {
LogicalResult verifyDependVars =
- verifyDependVarList(*this, getDepends(), getDependVars());
+ verifyDependVarList(*this, getDependKinds(), getDependVars());
return failed(verifyDependVars) ? verifyDependVars
- : verifyMapClause(*this, getMapOperands());
+ : verifyMapClause(*this, getMapVars());
}
//===----------------------------------------------------------------------===//
@@ -1460,24 +1450,24 @@ LogicalResult TargetUpdateOp::verify() {
//===----------------------------------------------------------------------===//
void TargetOp::build(OpBuilder &builder, OperationState &state,
- const TargetClauseOps &clauses) {
+ const TargetOperands &clauses) {
MLIRContext *ctx = builder.getContext();
// TODO Store clauses in op: allocateVars, allocatorVars, inReductionVars,
- // inReduceVarByRef, inReductionDeclSymbols, reductionVars, reduceVarByRef,
- // reductionDeclSymbols.
- TargetOp::build(
- builder, state, clauses.ifVar, clauses.deviceVar, clauses.threadLimitVar,
- makeArrayAttr(ctx, clauses.dependTypeAttrs), clauses.dependVars,
- clauses.nowaitAttr, clauses.isDevicePtrVars, clauses.hasDeviceAddrVars,
- clauses.mapVars, clauses.privateVars,
- makeArrayAttr(ctx, clauses.privatizers));
+ // inReductionByref, inReductionSyms.
+ TargetOp::build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{},
+ makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars,
+ clauses.device, clauses.hasDeviceAddrVars, clauses.ifVar,
+ /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr,
+ /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars,
+ clauses.mapVars, clauses.nowait, clauses.privateVars,
+ makeArrayAttr(ctx, clauses.privateSyms), clauses.threadLimit);
}
LogicalResult TargetOp::verify() {
LogicalResult verifyDependVars =
- verifyDependVarList(*this, getDepends(), getDependVars());
+ verifyDependVarList(*this, getDependKinds(), getDependVars());
return failed(verifyDependVars) ? verifyDependVars
- : verifyMapClause(*this, getMapOperands());
+ : verifyMapClause(*this, getMapVars());
}
//===----------------------------------------------------------------------===//
@@ -1486,56 +1476,53 @@ LogicalResult TargetOp::verify() {
void ParallelOp::build(OpBuilder &builder, OperationState &state,
ArrayRef<NamedAttribute> attributes) {
- ParallelOp::build(
- builder, state, /*if_expr=*/nullptr, /*num_threads_var=*/nullptr,
- /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
- /*reduction_vars=*/ValueRange(), /*reduction_vars_byref=*/nullptr,
- /*reductions=*/nullptr, /*proc_bind_val=*/nullptr,
- /*private_vars=*/ValueRange(), /*privatizers=*/nullptr);
+ ParallelOp::build(builder, state, /*allocate_vars=*/ValueRange(),
+ /*allocator_vars=*/ValueRange(), /*if_expr=*/nullptr,
+ /*num_threads=*/nullptr, /*private_vars=*/ValueRange(),
+ /*private_syms=*/nullptr, /*proc_bind_kind=*/nullptr,
+ /*reduction_vars=*/ValueRange(),
+ /*reduction_byref=*/nullptr, /*reduction_syms=*/nullptr);
state.addAttributes(attributes);
}
void ParallelOp::build(OpBuilder &builder, OperationState &state,
- const ParallelClauseOps &clauses) {
+ const ParallelOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- ParallelOp::build(builder, state, clauses.ifVar, clauses.numThreadsVar,
- clauses.allocateVars, clauses.allocatorVars,
- clauses.reductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
- makeArrayAttr(ctx, clauses.reductionDeclSymbols),
- clauses.procBindKindAttr, clauses.privateVars,
- makeArrayAttr(ctx, clauses.privatizers));
+ ParallelOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+ clauses.ifVar, clauses.numThreads, clauses.privateVars,
+ makeArrayAttr(ctx, clauses.privateSyms),
+ clauses.procBindKind, clauses.reductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+ makeArrayAttr(ctx, clauses.reductionSyms));
}
template <typename OpType>
static LogicalResult verifyPrivateVarList(OpType &op) {
auto privateVars = op.getPrivateVars();
- auto privatizers = op.getPrivatizersAttr();
+ auto privateSyms = op.getPrivateSymsAttr();
- if (privateVars.empty() && (privatizers == nullptr || privatizers.empty()))
+ if (privateVars.empty() && (privateSyms == nullptr || privateSyms.empty()))
return success();
auto numPrivateVars = privateVars.size();
- auto numPrivatizers = (privatizers == nullptr) ? 0 : privatizers.size();
+ auto numPrivateSyms = (privateSyms == nullptr) ? 0 : privateSyms.size();
- if (numPrivateVars != numPrivatizers)
+ if (numPrivateVars != numPrivateSyms)
return op.emitError() << "inconsistent number of private variables and "
"privatizer op symbols, private vars: "
<< numPrivateVars
- << " vs. privatizer op symbols: " << numPrivatizers;
+ << " vs. privatizer op symbols: " << numPrivateSyms;
- for (auto privateVarInfo : llvm::zip_equal(privateVars, privatizers)) {
+ for (auto privateVarInfo : llvm::zip_equal(privateVars, privateSyms)) {
Type varType = std::get<0>(privateVarInfo).getType();
- SymbolRefAttr privatizerSym =
- cast<SymbolRefAttr>(std::get<1>(privateVarInfo));
+ SymbolRefAttr privateSym = cast<SymbolRefAttr>(std::get<1>(privateVarInfo));
PrivateClauseOp privatizerOp =
- SymbolTable::lookupNearestSymbolFrom<PrivateClauseOp>(op,
- privatizerSym);
+ SymbolTable::lookupNearestSymbolFrom<PrivateClauseOp>(op, privateSym);
if (privatizerOp == nullptr)
return op.emitError() << "failed to lookup privatizer op with symbol: '"
- << privatizerSym << "'";
+ << privateSym << "'";
Type privatizerType = privatizerOp.getType();
@@ -1570,15 +1557,15 @@ LogicalResult ParallelOp::verify() {
}
}
- if (getAllocateVars().size() != getAllocatorsVars().size())
+ if (getAllocateVars().size() != getAllocatorVars().size())
return emitError(
"expected equal sizes for allocate and allocator variables");
if (failed(verifyPrivateVarList(*this)))
return failure();
- return verifyReductionVarList(*this, getReductions(), getReductionVars(),
- getReductionVarsByref());
+ return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+ getReductionByref());
}
//===----------------------------------------------------------------------===//
@@ -1593,15 +1580,16 @@ static bool opInGlobalImplicitParallelRegion(Operation *op) {
}
void TeamsOp::build(OpBuilder &builder, OperationState &state,
- const TeamsClauseOps &clauses) {
+ const TeamsOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- // TODO Store clauses in op: privateVars, privatizers.
- TeamsOp::build(builder, state, clauses.numTeamsLowerVar,
- clauses.numTeamsUpperVar, clauses.ifVar,
- clauses.threadLimitVar, clauses.allocateVars,
- clauses.allocatorVars, clauses.reductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
- makeArrayAttr(ctx, clauses.reductionDeclSymbols));
+ // TODO Store clauses in op: privateVars, privateSyms.
+ TeamsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+ clauses.ifVar, clauses.numTeamsLower, clauses.numTeamsUpper,
+ /*private_vars=*/{},
+ /*private_syms=*/nullptr, clauses.reductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+ makeArrayAttr(ctx, clauses.reductionSyms),
+ clauses.threadLimit);
}
LogicalResult TeamsOp::verify() {
@@ -1628,12 +1616,12 @@ LogicalResult TeamsOp::verify() {
}
// Check for allocate clause restrictions
- if (getAllocateVars().size() != getAllocatorsVars().size())
+ if (getAllocateVars().size() != getAllocatorVars().size())
return emitError(
"expected equal sizes for allocate and allocator variables");
- return verifyReductionVarList(*this, getReductions(), getReductionVars(),
- getReductionVarsByref());
+ return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+ getReductionByref());
}
//===----------------------------------------------------------------------===//
@@ -1641,23 +1629,23 @@ LogicalResult TeamsOp::verify() {
//===----------------------------------------------------------------------===//
void SectionsOp::build(OpBuilder &builder, OperationState &state,
- const SectionsClauseOps &clauses) {
+ const SectionsOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- // TODO Store clauses in op: privateVars, privatizers.
- SectionsOp::build(builder, state, clauses.reductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
- makeArrayAttr(ctx, clauses.reductionDeclSymbols),
- clauses.allocateVars, clauses.allocatorVars,
- clauses.nowaitAttr);
+ // TODO Store clauses in op: privateVars, privateSyms.
+ SectionsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+ clauses.nowait, /*private_vars=*/{},
+ /*private_syms=*/nullptr, clauses.reductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+ makeArrayAttr(ctx, clauses.reductionSyms));
}
LogicalResult SectionsOp::verify() {
- if (getAllocateVars().size() != getAllocatorsVars().size())
+ if (getAllocateVars().size() != getAllocatorVars().size())
return emitError(
"expected equal sizes for allocate and allocator variables");
- return verifyReductionVarList(*this, getReductions(), getReductionVars(),
- getReductionVarsByref());
+ return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+ getReductionByref());
}
LogicalResult SectionsOp::verifyRegions() {
@@ -1676,23 +1664,23 @@ LogicalResult SectionsOp::verifyRegions() {
//===----------------------------------------------------------------------===//
void SingleOp::build(OpBuilder &builder, OperationState &state,
- const SingleClauseOps &clauses) {
+ const SingleOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- // TODO Store clauses in op: privateVars, privatizers.
+ // TODO Store clauses in op: privateVars, privateSyms.
SingleOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
clauses.copyprivateVars,
- makeArrayAttr(ctx, clauses.copyprivateFuncs),
- clauses.nowaitAttr);
+ makeArrayAttr(ctx, clauses.copyprivateSyms), clauses.nowait,
+ /*private_vars=*/{}, /*private_syms=*/nullptr);
}
LogicalResult SingleOp::verify() {
// Check for allocate clause restrictions
- if (getAllocateVars().size() != getAllocatorsVars().size())
+ if (getAllocateVars().size() != getAllocatorVars().size())
return emitError(
"expected equal sizes for allocate and allocator variables");
- return verifyCopyPrivateVarList(*this, getCopyprivateVars(),
- getCopyprivateFuncs());
+ return verifyCopyprivateVarList(*this, getCopyprivateVars(),
+ getCopyprivateSyms());
}
//===----------------------------------------------------------------------===//
@@ -1729,30 +1717,31 @@ void printWsloop(OpAsmPrinter &p, Operation *op, Region &region,
void WsloopOp::build(OpBuilder &builder, OperationState &state,
ArrayRef<NamedAttribute> attributes) {
- build(builder, state, /*linear_vars=*/ValueRange(),
- /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(),
- /*reduction_vars_byref=*/nullptr,
- /*reductions=*/nullptr, /*schedule_val=*/nullptr,
- /*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr,
- /*simd_modifier=*/false, /*nowait=*/false,
- /*ordered_val=*/nullptr, /*order_val=*/nullptr,
- /*order_modifier=*/nullptr);
+ build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{},
+ /*linear_vars=*/ValueRange(), /*linear_step_vars=*/ValueRange(),
+ /*nowait=*/false, /*order=*/nullptr, /*order_mod=*/nullptr,
+ /*ordered=*/nullptr, /*private_vars=*/{}, /*private_syms=*/nullptr,
+ /*reduction_vars=*/ValueRange(), /*reduction_byref=*/nullptr,
+ /*reduction_syms=*/nullptr, /*schedule_kind=*/nullptr,
+ /*schedule_chunk=*/nullptr, /*schedule_mod=*/nullptr,
+ /*schedule_simd=*/false);
state.addAttributes(attributes);
}
void WsloopOp::build(OpBuilder &builder, OperationState &state,
- const WsloopClauseOps &clauses) {
+ const WsloopOperands &clauses) {
MLIRContext *ctx = builder.getContext();
// TODO: Store clauses in op: allocateVars, allocatorVars, privateVars,
- // privatizers.
- WsloopOp::build(builder, state, clauses.linearVars, clauses.linearStepVars,
- clauses.reductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
- makeArrayAttr(ctx, clauses.reductionDeclSymbols),
- clauses.scheduleValAttr, clauses.scheduleChunkVar,
- clauses.scheduleModAttr, clauses.scheduleSimdAttr,
- clauses.nowaitAttr, clauses.orderedAttr, clauses.orderAttr,
- clauses.orderModAttr);
+ // privateSyms.
+ WsloopOp::build(
+ builder, state,
+ /*allocate_vars=*/{}, /*allocator_vars=*/{}, clauses.linearVars,
+ clauses.linearStepVars, clauses.nowait, clauses.order, clauses.orderMod,
+ clauses.ordered, /*private_vars=*/{}, /*private_syms=*/nullptr,
+ clauses.reductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+ makeArrayAttr(ctx, clauses.reductionSyms), clauses.scheduleKind,
+ clauses.scheduleChunk, clauses.scheduleMod, clauses.scheduleSimd);
}
LogicalResult WsloopOp::verify() {
@@ -1766,8 +1755,8 @@ LogicalResult WsloopOp::verify() {
return emitError() << "only supported nested wrapper is 'omp.simd'";
}
- return verifyReductionVarList(*this, getReductions(), getReductionVars(),
- getReductionVarsByref());
+ return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(),
+ getReductionByref());
}
//===----------------------------------------------------------------------===//
@@ -1775,14 +1764,17 @@ LogicalResult WsloopOp::verify() {
//===----------------------------------------------------------------------===//
void SimdOp::build(OpBuilder &builder, OperationState &state,
- const SimdClauseOps &clauses) {
+ const SimdOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- // TODO Store clauses in op: privateVars, privatizers, reductionVars,
- // reduceVarByRef, reductionDeclSymbols.
+ // TODO Store clauses in op: linearVars, linearStepVars, privateVars,
+ // privateSyms, reductionVars, reductionByref, reductionSyms.
SimdOp::build(builder, state, clauses.alignedVars,
- makeArrayAttr(ctx, clauses.alignmentAttrs), clauses.ifVar,
- clauses.nontemporalVars, clauses.orderAttr,
- clauses.orderModAttr, clauses.safelenAttr, clauses.simdlenAttr);
+ makeArrayAttr(ctx, clauses.alignments), clauses.ifVar,
+ /*linear_vars=*/{}, /*linear_step_vars=*/{},
+ clauses.nontemporalVars, clauses.order, clauses.orderMod,
+ /*private_vars=*/{}, /*private_syms=*/nullptr,
+ /*reduction_vars=*/{}, /*reduction_byref=*/nullptr,
+ /*reduction_syms=*/nullptr, clauses.safelen, clauses.simdlen);
}
LogicalResult SimdOp::verify() {
@@ -1792,8 +1784,7 @@ LogicalResult SimdOp::verify() {
<< "simdlen clause and safelen clause are both present, but the "
"simdlen value is not less than or equal to safelen value";
- if (verifyAlignedClause(*this, getAlignmentValues(), getAlignedVars())
- .failed())
+ if (verifyAlignedClause(*this, getAlignments(), getAlignedVars()).failed())
return failure();
if (verifyNontemporalClause(*this, getNontemporalVars()).failed())
@@ -1813,20 +1804,20 @@ LogicalResult SimdOp::verify() {
//===----------------------------------------------------------------------===//
void DistributeOp::build(OpBuilder &builder, OperationState &state,
- const DistributeClauseOps &clauses) {
- // TODO Store clauses in op: privateVars, privatizers.
- DistributeOp::build(builder, state, clauses.distScheduleStaticAttr,
- clauses.distScheduleChunkSizeVar, clauses.allocateVars,
- clauses.allocatorVars, clauses.orderAttr,
- clauses.orderModAttr);
+ const DistributeOperands &clauses) {
+ // TODO Store clauses in op: privateVars, privateSyms.
+ DistributeOp::build(
+ builder, state, clauses.allocateVars, clauses.allocatorVars,
+ clauses.distScheduleStatic, clauses.distScheduleChunkSize, clauses.order,
+ clauses.orderMod, /*private_vars=*/{}, /*private_syms=*/nullptr);
}
LogicalResult DistributeOp::verify() {
- if (this->getChunkSize() && !this->getDistScheduleStatic())
+ if (this->getDistScheduleChunkSize() && !this->getDistScheduleStatic())
return emitOpError() << "chunk size set without "
"dist_schedule_static being present";
- if (getAllocateVars().size() != getAllocatorsVars().size())
+ if (getAllocateVars().size() != getAllocatorVars().size())
return emitError(
"expected equal sizes for allocate and allocator variables");
@@ -1942,26 +1933,26 @@ LogicalResult DeclareReductionOp::verifyRegions() {
//===----------------------------------------------------------------------===//
void TaskOp::build(OpBuilder &builder, OperationState &state,
- const TaskClauseOps &clauses) {
+ const TaskOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- // TODO Store clauses in op: privateVars, privatizers.
- TaskOp::build(
- builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr,
- clauses.mergeableAttr, clauses.inReductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.inReductionVarsByRef),
- makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.priorityVar,
- makeArrayAttr(ctx, clauses.dependTypeAttrs), clauses.dependVars,
- clauses.allocateVars, clauses.allocatorVars);
+ // TODO Store clauses in op: privateVars, privateSyms.
+ TaskOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars,
+ makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars,
+ clauses.final, clauses.ifVar, clauses.inReductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.inReductionByref),
+ makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable,
+ clauses.priority, /*private_vars=*/{}, /*private_syms=*/nullptr,
+ clauses.untied);
}
LogicalResult TaskOp::verify() {
LogicalResult verifyDependVars =
- verifyDependVarList(*this, getDepends(), getDependVars());
+ verifyDependVarList(*this, getDependKinds(), getDependVars());
return failed(verifyDependVars)
? verifyDependVars
- : verifyReductionVarList(*this, getInReductions(),
+ : verifyReductionVarList(*this, getInReductionSyms(),
getInReductionVars(),
- getInReductionVarsByref());
+ getInReductionByref());
}
//===----------------------------------------------------------------------===//
@@ -1969,19 +1960,18 @@ LogicalResult TaskOp::verify() {
//===----------------------------------------------------------------------===//
void TaskgroupOp::build(OpBuilder &builder, OperationState &state,
- const TaskgroupClauseOps &clauses) {
+ const TaskgroupOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- TaskgroupOp::build(
- builder, state, clauses.taskReductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.taskReductionVarsByRef),
- makeArrayAttr(ctx, clauses.taskReductionDeclSymbols),
- clauses.allocateVars, clauses.allocatorVars);
+ TaskgroupOp::build(builder, state, clauses.allocateVars,
+ clauses.allocatorVars, clauses.taskReductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.taskReductionByref),
+ makeArrayAttr(ctx, clauses.taskReductionSyms));
}
LogicalResult TaskgroupOp::verify() {
- return verifyReductionVarList(*this, getTaskReductions(),
+ return verifyReductionVarList(*this, getTaskReductionSyms(),
getTaskReductionVars(),
- getTaskReductionVarsByref());
+ getTaskReductionByref());
}
//===----------------------------------------------------------------------===//
@@ -1989,18 +1979,18 @@ LogicalResult TaskgroupOp::verify() {
//===----------------------------------------------------------------------===//
void TaskloopOp::build(OpBuilder &builder, OperationState &state,
- const TaskloopClauseOps &clauses) {
+ const TaskloopOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- // TODO Store clauses in op: privateVars, privatizers.
+ // TODO Store clauses in op: privateVars, privateSyms.
TaskloopOp::build(
- builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr,
- clauses.mergeableAttr, clauses.inReductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.inReductionVarsByRef),
- makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.reductionVars,
- makeDenseBoolArrayAttr(ctx, clauses.reductionVarsByRef),
- makeArrayAttr(ctx, clauses.reductionDeclSymbols), clauses.priorityVar,
- clauses.allocateVars, clauses.allocatorVars, clauses.grainsizeVar,
- clauses.numTasksVar, clauses.nogroupAttr);
+ builder, state, clauses.allocateVars, clauses.allocatorVars,
+ clauses.final, clauses.grainsize, clauses.ifVar, clauses.inReductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.inReductionByref),
+ makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable,
+ clauses.nogroup, clauses.numTasks, clauses.priority, /*private_vars=*/{},
+ /*private_syms=*/nullptr, clauses.reductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+ makeArrayAttr(ctx, clauses.reductionSyms), clauses.untied);
}
SmallVector<Value> TaskloopOp::getAllReductionVars() {
@@ -2012,14 +2002,14 @@ SmallVector<Value> TaskloopOp::getAllReductionVars() {
}
LogicalResult TaskloopOp::verify() {
- if (getAllocateVars().size() != getAllocatorsVars().size())
+ if (getAllocateVars().size() != getAllocatorVars().size())
return emitError(
"expected equal sizes for allocate and allocator variables");
- if (failed(verifyReductionVarList(*this, getReductions(), getReductionVars(),
- getReductionVarsByref())) ||
- failed(verifyReductionVarList(*this, getInReductions(),
+ if (failed(verifyReductionVarList(*this, getReductionSyms(),
+ getReductionVars(), getReductionByref())) ||
+ failed(verifyReductionVarList(*this, getInReductionSyms(),
getInReductionVars(),
- getInReductionVarsByref())))
+ getInReductionByref())))
return failure();
if (!getReductionVars().empty() && getNogroup())
@@ -2031,7 +2021,7 @@ LogicalResult TaskloopOp::verify() {
"and an in_reduction clause");
}
- if (getGrainSize() && getNumTasks()) {
+ if (getGrainsize() && getNumTasks()) {
return emitError(
"the grainsize clause and num_tasks clause are mutually exclusive and "
"may not appear on the same taskloop directive");
@@ -2072,7 +2062,7 @@ ParseResult LoopNestOp::parse(OpAsmParser &parser, OperationState &result) {
// Parse "inclusive" flag.
if (succeeded(parser.parseOptionalKeyword("inclusive")))
- result.addAttribute("inclusive",
+ result.addAttribute("loop_inclusive",
UnitAttr::get(parser.getBuilder().getContext()));
// Parse step values.
@@ -2099,28 +2089,29 @@ ParseResult LoopNestOp::parse(OpAsmParser &parser, OperationState &result) {
void LoopNestOp::print(OpAsmPrinter &p) {
Region &region = getRegion();
auto args = region.getArguments();
- p << " (" << args << ") : " << args[0].getType() << " = (" << getLowerBound()
- << ") to (" << getUpperBound() << ") ";
- if (getInclusive())
+ p << " (" << args << ") : " << args[0].getType() << " = ("
+ << getLoopLowerBounds() << ") to (" << getLoopUpperBounds() << ") ";
+ if (getLoopInclusive())
p << "inclusive ";
- p << "step (" << getStep() << ") ";
+ p << "step (" << getLoopSteps() << ") ";
p.printRegion(region, /*printEntryBlockArgs=*/false);
}
void LoopNestOp::build(OpBuilder &builder, OperationState &state,
- const LoopNestClauseOps &clauses) {
- LoopNestOp::build(builder, state, clauses.loopLBVar, clauses.loopUBVar,
- clauses.loopStepVar, clauses.loopInclusiveAttr);
+ const LoopNestOperands &clauses) {
+ LoopNestOp::build(builder, state, clauses.loopLowerBounds,
+ clauses.loopUpperBounds, clauses.loopSteps,
+ clauses.loopInclusive);
}
LogicalResult LoopNestOp::verify() {
- if (getLowerBound().empty())
+ if (getLoopLowerBounds().empty())
return emitOpError() << "must represent at least one loop";
- if (getLowerBound().size() != getIVs().size())
+ if (getLoopLowerBounds().size() != getIVs().size())
return emitOpError() << "number of range arguments and IVs do not match";
- for (auto [lb, iv] : llvm::zip_equal(getLowerBound(), getIVs())) {
+ for (auto [lb, iv] : llvm::zip_equal(getLoopLowerBounds(), getIVs())) {
if (lb.getType() != iv.getType())
return emitOpError()
<< "range argument type does not match corresponding IV type";
@@ -2152,13 +2143,12 @@ void LoopNestOp::gatherWrappers(
//===----------------------------------------------------------------------===//
void CriticalDeclareOp::build(OpBuilder &builder, OperationState &state,
- const CriticalClauseOps &clauses) {
- CriticalDeclareOp::build(builder, state, clauses.criticalNameAttr,
- clauses.hintAttr);
+ const CriticalDeclareOperands &clauses) {
+ CriticalDeclareOp::build(builder, state, clauses.symName, clauses.hint);
}
LogicalResult CriticalDeclareOp::verify() {
- return verifySynchronizationHint(*this, getHintVal());
+ return verifySynchronizationHint(*this, getHint());
}
LogicalResult CriticalOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
@@ -2193,7 +2183,7 @@ static LogicalResult verifyOrderedParent(Operation &op) {
Operation *wrapper = loopOp->getParentOp();
if (auto wsloopOp = dyn_cast<WsloopOp>(wrapper)) {
- IntegerAttr orderedAttr = wsloopOp.getOrderedValAttr();
+ IntegerAttr orderedAttr = wsloopOp.getOrderedAttr();
if (!orderedAttr)
return op.emitOpError() << "the enclosing worksharing-loop region must "
"have an ordered clause";
@@ -2213,9 +2203,9 @@ static LogicalResult verifyOrderedParent(Operation &op) {
}
void OrderedOp::build(OpBuilder &builder, OperationState &state,
- const OrderedOpClauseOps &clauses) {
- OrderedOp::build(builder, state, clauses.doacrossDependTypeAttr,
- clauses.doacrossNumLoopsAttr, clauses.doacrossVectorVars);
+ const OrderedOperands &clauses) {
+ OrderedOp::build(builder, state, clauses.doacrossDependType,
+ clauses.doacrossNumLoops, clauses.doacrossDependVars);
}
LogicalResult OrderedOp::verify() {
@@ -2223,7 +2213,7 @@ LogicalResult OrderedOp::verify() {
return failure();
auto wrapper = (*this)->getParentOfType<WsloopOp>();
- if (!wrapper || *wrapper.getOrderedVal() != *getNumLoopsVal())
+ if (!wrapper || *wrapper.getOrdered() != *getDoacrossNumLoops())
return emitOpError() << "number of variables in depend clause does not "
<< "match number of iteration variables in the "
<< "doacross loop";
@@ -2232,13 +2222,13 @@ LogicalResult OrderedOp::verify() {
}
void OrderedRegionOp::build(OpBuilder &builder, OperationState &state,
- const OrderedRegionClauseOps &clauses) {
- OrderedRegionOp::build(builder, state, clauses.parLevelSimdAttr);
+ const OrderedRegionOperands &clauses) {
+ OrderedRegionOp::build(builder, state, clauses.parLevelSimd);
}
LogicalResult OrderedRegionOp::verify() {
// TODO: The code generation for ordered simd directive is not supported yet.
- if (getSimd())
+ if (getParLevelSimd())
return failure();
return verifyOrderedParent(**this);
@@ -2249,9 +2239,10 @@ LogicalResult OrderedRegionOp::verify() {
//===----------------------------------------------------------------------===//
void TaskwaitOp::build(OpBuilder &builder, OperationState &state,
- const TaskwaitClauseOps &clauses) {
- // TODO Store clauses in op: dependTypeAttrs, dependVars, nowaitAttr.
- TaskwaitOp::build(builder, state);
+ const TaskwaitOperands &clauses) {
+ // TODO Store clauses in op: dependKinds, dependVars, nowait.
+ TaskwaitOp::build(builder, state, /*depend_kinds=*/nullptr,
+ /*depend_vars=*/{}, /*nowait=*/nullptr);
}
//===----------------------------------------------------------------------===//
@@ -2262,14 +2253,14 @@ LogicalResult AtomicReadOp::verify() {
if (verifyCommon().failed())
return mlir::failure();
- if (auto mo = getMemoryOrderVal()) {
+ if (auto mo = getMemoryOrder()) {
if (*mo == ClauseMemoryOrderKind::Acq_rel ||
*mo == ClauseMemoryOrderKind::Release) {
return emitError(
"memory-order must not be acq_rel or release for atomic reads");
}
}
- return verifySynchronizationHint(*this, getHintVal());
+ return verifySynchronizationHint(*this, getHint());
}
//===----------------------------------------------------------------------===//
@@ -2280,14 +2271,14 @@ LogicalResult AtomicWriteOp::verify() {
if (verifyCommon().failed())
return mlir::failure();
- if (auto mo = getMemoryOrderVal()) {
+ if (auto mo = getMemoryOrder()) {
if (*mo == ClauseMemoryOrderKind::Acq_rel ||
*mo == ClauseMemoryOrderKind::Acquire) {
return emitError(
"memory-order must not be acq_rel or acquire for atomic writes");
}
}
- return verifySynchronizationHint(*this, getHintVal());
+ return verifySynchronizationHint(*this, getHint());
}
//===----------------------------------------------------------------------===//
@@ -2301,9 +2292,8 @@ LogicalResult AtomicUpdateOp::canonicalize(AtomicUpdateOp op,
return success();
}
if (Value writeVal = op.getWriteOpVal()) {
- rewriter.replaceOpWithNewOp<AtomicWriteOp>(op, op.getX(), writeVal,
- op.getHintValAttr(),
- op.getMemoryOrderValAttr());
+ rewriter.replaceOpWithNewOp<AtomicWriteOp>(
+ op, op.getX(), writeVal, op.getHintAttr(), op.getMemoryOrderAttr());
return success();
}
return failure();
@@ -2313,7 +2303,7 @@ LogicalResult AtomicUpdateOp::verify() {
if (verifyCommon().failed())
return mlir::failure();
- if (auto mo = getMemoryOrderVal()) {
+ if (auto mo = getMemoryOrder()) {
if (*mo == ClauseMemoryOrderKind::Acq_rel ||
*mo == ClauseMemoryOrderKind::Acquire) {
return emitError(
@@ -2321,7 +2311,7 @@ LogicalResult AtomicUpdateOp::verify() {
}
}
- return verifySynchronizationHint(*this, getHintVal());
+ return verifySynchronizationHint(*this, getHint());
}
LogicalResult AtomicUpdateOp::verifyRegions() { return verifyRegionsCommon(); }
@@ -2349,19 +2339,19 @@ AtomicUpdateOp AtomicCaptureOp::getAtomicUpdateOp() {
}
LogicalResult AtomicCaptureOp::verify() {
- return verifySynchronizationHint(*this, getHintVal());
+ return verifySynchronizationHint(*this, getHint());
}
LogicalResult AtomicCaptureOp::verifyRegions() {
if (verifyRegionsCommon().failed())
return mlir::failure();
- if (getFirstOp()->getAttr("hint_val") || getSecondOp()->getAttr("hint_val"))
+ if (getFirstOp()->getAttr("hint") || getSecondOp()->getAttr("hint"))
return emitOpError(
"operations inside capture region must not have hint clause");
- if (getFirstOp()->getAttr("memory_order_val") ||
- getSecondOp()->getAttr("memory_order_val"))
+ if (getFirstOp()->getAttr("memory_order") ||
+ getSecondOp()->getAttr("memory_order"))
return emitOpError(
"operations inside capture region must not have memory_order clause");
return success();
@@ -2372,13 +2362,12 @@ LogicalResult AtomicCaptureOp::verifyRegions() {
//===----------------------------------------------------------------------===//
void CancelOp::build(OpBuilder &builder, OperationState &state,
- const CancelClauseOps &clauses) {
- CancelOp::build(builder, state, clauses.cancelDirectiveNameAttr,
- clauses.ifVar);
+ const CancelOperands &clauses) {
+ CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifVar);
}
LogicalResult CancelOp::verify() {
- ClauseCancellationConstructType cct = getCancellationConstructTypeVal();
+ ClauseCancellationConstructType cct = getCancelDirective();
Operation *parentOp = (*this)->getParentOp();
if (!parentOp) {
@@ -2404,7 +2393,7 @@ LogicalResult CancelOp::verify() {
return emitError() << "A worksharing construct that is canceled "
<< "must not have a nowait clause";
}
- if (wsloopOp.getOrderedValAttr()) {
+ if (wsloopOp.getOrderedAttr()) {
return emitError() << "A worksharing construct that is canceled "
<< "must not have an ordered clause";
}
@@ -2429,12 +2418,12 @@ LogicalResult CancelOp::verify() {
//===----------------------------------------------------------------------===//
void CancellationPointOp::build(OpBuilder &builder, OperationState &state,
- const CancellationPointClauseOps &clauses) {
- CancellationPointOp::build(builder, state, clauses.cancelDirectiveNameAttr);
+ const CancellationPointOperands &clauses) {
+ CancellationPointOp::build(builder, state, clauses.cancelDirective);
}
LogicalResult CancellationPointOp::verify() {
- ClauseCancellationConstructType cct = getCancellationConstructTypeVal();
+ ClauseCancellationConstructType cct = getCancelDirective();
Operation *parentOp = (*this)->getParentOp();
if (!parentOp) {
@@ -2574,8 +2563,8 @@ LogicalResult PrivateClauseOp::verify() {
//===----------------------------------------------------------------------===//
void MaskedOp::build(OpBuilder &builder, OperationState &state,
- const MaskedClauseOps &clauses) {
- MaskedOp::build(builder, state, clauses.filteredThreadIdVar);
+ const MaskedOperands &clauses) {
+ MaskedOp::build(builder, state, clauses.filteredThreadId);
}
#define GET_ATTRDEF_CLASSES
diff --git a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
index d363ffe..8c73515 100644
--- a/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SCF/Transforms/CMakeLists.txt
@@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRSCFTransforms
ParallelLoopCollapsing.cpp
ParallelLoopFusion.cpp
ParallelLoopTiling.cpp
+ RotateWhileLoop.cpp
StructuralTypeConversions.cpp
TileUsingInterface.cpp
WrapInZeroTripCheck.cpp
diff --git a/mlir/lib/Dialect/SCF/Transforms/RotateWhileLoop.cpp b/mlir/lib/Dialect/SCF/Transforms/RotateWhileLoop.cpp
new file mode 100644
index 0000000..8707ec9
--- /dev/null
+++ b/mlir/lib/Dialect/SCF/Transforms/RotateWhileLoop.cpp
@@ -0,0 +1,44 @@
+//===- RotateWhileLoop.cpp - scf.while loop rotation ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Rotates `scf.while` loops.
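+// Rotation reuses `scf::wrapWhileLoopInZeroTripCheck`: the loop condition is
+// checked once up front and the loop itself is emitted in 'do-while' form, so
+// loops already in that form are left untouched (see the pattern below).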
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+
+#include "mlir/Dialect/SCF/IR/SCF.h"
+
+using namespace mlir;
+
+namespace {
+struct RotateWhileLoopPattern : OpRewritePattern<scf::WhileOp> {
+ using OpRewritePattern<scf::WhileOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(scf::WhileOp whileOp,
+ PatternRewriter &rewriter) const final {
+ // Setting this option would lead to infinite recursion on a greedy driver
+ // as 'do-while' loops wouldn't be skipped.
+ constexpr bool forceCreateCheck = false;
+ FailureOr<scf::WhileOp> result =
+ scf::wrapWhileLoopInZeroTripCheck(whileOp, rewriter, forceCreateCheck);
+ // scf::wrapWhileLoopInZeroTripCheck hasn't yet implemented a failure
+ // mechanism. 'do-while' loops are simply returned unmodified. In order to
+ // stop recursion, we check that the input and output operations differ.
+ return success(succeeded(result) && *result != whileOp);
+ }
+};
+} // namespace
+
+namespace mlir {
+namespace scf {
+void populateSCFRotateWhileLoopPatterns(RewritePatternSet &patterns) {
+ patterns.add<RotateWhileLoopPattern>(patterns.getContext());
+}
+} // namespace scf
+} // namespace mlir
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index a139281..e404c01 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -42,6 +42,16 @@ scf::SCFTilingOptions::setTileSizes(ArrayRef<OpFoldResult> ts) {
return *this;
}
+scf::SCFTilingOptions &
+scf::SCFTilingOptions::setNumThreads(ArrayRef<OpFoldResult> nt) {
+ assert(!numThreadsComputationFunction && "num threads already set");
+ auto numThreads = llvm::to_vector(nt);
+ numThreadsComputationFunction = [numThreads](OpBuilder &b, Operation *op) {
+ return numThreads;
+ };
+ return *this;
+}
+
/// Helper method to adjust the interchange vector to match the iteration
/// domain.
static SmallVector<int64_t>
@@ -61,7 +71,120 @@ fillInterchangeVector(ArrayRef<int64_t> interchangeVector,
// tileUsingSCF implementation.
//===----------------------------------------------------------------------===//
-// Check if `stride` evenly divides the trip count `size - offset`.
+/// Verify the tile size options are set in a consistent manner.
+static LogicalResult
+verifyTileSizeOptions(RewriterBase &rewriter, Location loc,
+ const scf::SCFTilingOptions &options) {
+ // Specifying number of threads is only supported on `scf.forall` op.
+ if (options.numThreadsComputationFunction &&
+ options.loopType != scf::SCFTilingOptions::LoopType::ForallOp) {
+ return rewriter.notifyMatchFailure(
+ loc, "number of threads can only by specified when loop type is "
+ "set to use `scf.forall`");
+ }
+
+ // If specified, check that the interchange vector is a permutation.
+ if (!options.interchangeVector.empty()) {
+ if (!isPermutationVector(options.interchangeVector)) {
+ return rewriter.notifyMatchFailure(
+ loc, "invalid interchange vector, not a permutation of the entire "
+ "iteration space");
+ }
+ }
+ return success();
+}
+
+/// Method to instantiate the tile sizes and/or number of threads specified
+/// by the user.
+static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
+getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op,
+ ArrayRef<Range> iterationDomain,
+ const scf::SCFTilingOptions &options) {
+ OpFoldResult zero = rewriter.getIndexAttr(0);
+ SmallVector<OpFoldResult> tileSizes, numThreads;
+ size_t numLoops = iterationDomain.size();
+
+ // Check whether the number of threads to use is specified.
+ if (options.numThreadsComputationFunction) {
+ numThreads = options.numThreadsComputationFunction(rewriter, op);
+ numThreads.resize(numLoops, zero);
+
+ // If the tile sizes are also specified, use those.
+ if (options.tileSizeComputationFunction) {
+ tileSizes = options.tileSizeComputationFunction(rewriter, op);
+ tileSizes.resize(numLoops, zero);
+ return {tileSizes, numThreads};
+ }
+
+ // Compute the tile sizes from the iteration domain and number
+ // of tiles as follows
+ // - niters = ceilDiv(ub - lb, step)
+ // - tileSize = ceilDiv(niters, numThreads)
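+ // For example (step assumed 1): lb = 0, ub = 100, numThreads = 7 gives
+ // niters = 100 and tileSize = ceilDiv(100, 7) = 15; the last thread then
+ // covers the remaining 10 iterations.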
+ AffineExpr s0, s1, s2;
+ bindSymbols(rewriter.getContext(), s0, s1, s2);
+ // TODO: The step here is assumed to be 1.
+ AffineExpr numItersExpr = (s1 - s0);
+ AffineExpr tileSizeExpr = numItersExpr.ceilDiv(s2);
+ tileSizes.resize(numLoops, zero);
+ for (auto [index, range, nt] :
+ llvm::enumerate(iterationDomain, numThreads)) {
+ if (isConstantIntValue(nt, 0))
+ continue;
+
+ tileSizes[index] = affine::makeComposedFoldedAffineApply(
+ rewriter, op.getLoc(), tileSizeExpr, {range.offset, range.size, nt});
+ }
+ tileSizes.resize(numLoops, zero);
+ return {tileSizes, numThreads};
+ }
+
+ // Enforce the convention that "tiling by zero"
+ // skips tiling a particular dimension. This convention is significantly
+ // simpler to handle than adjusting affine maps to account for missing
+ // dimensions.
+ assert(options.tileSizeComputationFunction &&
+ "expected tile sizes to be specified");
+ tileSizes = options.tileSizeComputationFunction(rewriter, op);
+ tileSizes.resize(numLoops, zero);
+
+ return {tileSizes, numThreads};
+}
+
+/// Checks if any of the tiled loops are not parallel.
+static void checkSafeToTileToForall(TilingInterface op,
+ ArrayRef<OpFoldResult> tileSizes,
+ ArrayRef<OpFoldResult> numThreads) {
+ auto iterators = op.getLoopIteratorTypes();
+ assert(iterators.size() == tileSizes.size() &&
+ "expected as many tile size values as number of loops");
+ assert((numThreads.empty() || (numThreads.size() == iterators.size())) &&
+ "when specified, expected number of threads to use for each loop");
+
+ for (auto [index, iterator, tileSize] :
+ llvm::enumerate(iterators, tileSizes)) {
+ // If num threads is specified, check that it is greater than one only for
+ // parallel dimensions.
+ if (!numThreads.empty()) {
+ if (std::optional<int64_t> constNumThreads =
+ getConstantIntValue(numThreads[index])) {
+ if (constNumThreads.value() > 1 &&
+ iterator != utils::IteratorType::parallel) {
+ op.emitWarning() << "tiling is not thread safe at axis #" << index;
+ }
+ }
+ continue;
+ }
+
+ if (std::optional<int64_t> constTileSize = getConstantIntValue(tileSize)) {
+ if (constTileSize.value() > 0 &&
+ iterator != utils::IteratorType::parallel) {
+ op.emitWarning() << "tiling is not thread safe at axis #" << index;
+ }
+ }
+ }
+}
+
+/// Check if `stride` evenly divides the trip count `size - offset`.
static bool tileDividesIterationDomain(Range loopRange) {
std::optional<int64_t> offsetAsInt = getConstantIntValue(loopRange.offset);
if (!offsetAsInt)
@@ -75,10 +198,10 @@ static bool tileDividesIterationDomain(Range loopRange) {
return ((sizeAsInt.value() - offsetAsInt.value()) % strideAsInt.value() == 0);
}
-/// Returns the bounded tile size given the current `iv`, `loopRange` and
-/// `tileSize`, i.e., `min(tileSize, range.end() - iv)`.
+/// Returns the bounded tile size given the current `offset`, `loopRange` and
+/// `tileSize`, i.e., `min(tileSize, range.end() - offset)`.
static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
- Range loopRange, Value iv,
+ Range loopRange, OpFoldResult offset,
OpFoldResult tileSize) {
std::optional<int64_t> ts = getConstantIntValue(tileSize);
if (ts && ts.value() == 1)
@@ -94,10 +217,132 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc,
AffineExpr s0, s1, d0;
bindDims(b.getContext(), d0);
bindSymbols(b.getContext(), s0, s1);
- AffineMap minMap = AffineMap::get(1, 2, {s0, s1 - d0}, b.getContext());
+ AffineMap minMap = AffineMap::get(1, 2, {s0 - d0, s1}, b.getContext());
Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size);
return affine::makeComposedFoldedAffineMin(
- b, loc, minMap, SmallVector<OpFoldResult>{iv, tileSize, size});
+ b, loc, minMap, SmallVector<OpFoldResult>{offset, size, tileSize});
+}
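A worked example of the bound above with plain integers (assuming constant offset and size); the affine min computes min(size - offset, tileSize):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t size = 100, tileSize = 15;
  // Interior tile: plenty of iterations remain, so the full tile size is used.
  assert(std::min(size - /*offset=*/30, tileSize) == 15);
  // Boundary tile: only 100 - 90 = 10 iterations remain.
  assert(std::min(size - /*offset=*/90, tileSize) == 10);
  return 0;
}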
+
+/// Returns true if the maximum tile offset `tileSize * (numThreads - 1)` is less
+/// than `iterationSize`.
+static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize,
+ OpFoldResult numThreads,
+ OpFoldResult iterationSize) {
+ std::optional<int64_t> tileSizeConst = getConstantIntValue(tileSize);
+ std::optional<int64_t> numThreadsConst = getConstantIntValue(numThreads);
+ std::optional<int64_t> iterSizeConst = getConstantIntValue(iterationSize);
+ if (!tileSizeConst || !numThreadsConst || !iterSizeConst)
+ return false;
+ return *tileSizeConst * (*numThreadsConst - 1) < *iterSizeConst;
+}
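For illustration, the same predicate evaluated on constants (a standalone sketch, not the MLIR helper):

#include <cassert>
#include <cstdint>

static bool canOmit(int64_t tileSize, int64_t numThreads, int64_t iterSize) {
  return tileSize * (numThreads - 1) < iterSize;
}

int main() {
  assert(canOmit(15, 7, 100)); // 15 * 6 = 90  < 100: no clamp needed.
  assert(!canOmit(2, 7, 10));  //  2 * 6 = 12 >= 10: clamp to zero required.
  return 0;
}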
+
+/// Compute the `OpFoldResult`s that represents the multi-dimensional
+/// `offset`s and `size`s of the tile of the iteration space that the
+/// innermost loop body of the generated tiled loops corresponds to.
+static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>>
+getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs,
+ ArrayRef<Range> iterationDomain,
+ ArrayRef<OpFoldResult> tileSizes,
+ ArrayRef<OpFoldResult> numThreads) {
+ SmallVector<OpFoldResult> offsets, sizes;
+ int materializedLoopNum = 0;
+
+ if (!numThreads.empty()) {
+ AffineExpr d0, d1, s0, s1;
+ AffineExpr offsetExpr, residualTileSizeExpr;
+ bindDims(rewriter.getContext(), d0, d1);
+ bindSymbols(rewriter.getContext(), s0, s1);
+ offsetExpr = d0 + d1 * s0;
+ residualTileSizeExpr = s1 - (d0 + d1 * s0);
+
+ for (auto [nt, tileSize, loopRange] :
+ llvm::zip_equal(numThreads, tileSizes, iterationDomain)) {
+
+ // Non-tiled cases, set the offset and size to the
+ // `loopRange.offset/size`.
+ if (isConstantIntValue(nt, 0)) {
+ offsets.push_back(loopRange.offset);
+ sizes.push_back(loopRange.size);
+ continue;
+ }
+
+ Value iv = ivs[materializedLoopNum++];
+ OpFoldResult offset = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, offsetExpr,
+ ArrayRef<OpFoldResult>{loopRange.offset, iv, tileSize});
+ OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, residualTileSizeExpr,
+ {loopRange.offset, nt, tileSize, loopRange.size});
+
+ OpFoldResult size = tileSize;
+ if (!isConstantIntValue(residualTileSize, 0)) {
+ OpFoldResult sizeMinusOffsetPerThread =
+ affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0,
+ {offset, loopRange.size});
+ size = affine::makeComposedFoldedAffineMin(
+ rewriter, loc,
+ AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()),
+ {sizeMinusOffsetPerThread, tileSize});
+ }
+
+      // Consider the case where the original loop was `[0, 10)`.
+      // If the number of threads is `7`, the tile size would be computed as
+      // `ceilDiv(10, 7) = 2`. For the last thread (thread_id = 6)
+      // - `offset = 0 + 6 * 2 = 12`
+      // - `tileSize = min(2, 10 - 12) = -2`
+ // To avoid negative tile sizes, we need to do a further
+ // `nonNegativeTileSize = affine.max(0, tileSize)`.
+ // This `max` can be avoided if
+ // `offset + tileSize * (numThreads - 1) < (ub - lb)`
+ if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) {
+ AffineMap maxMap =
+ AffineMap::getMultiDimIdentityMap(2, rewriter.getContext());
+ size = affine::makeComposedFoldedAffineMax(
+ rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size});
+ }
+
+ offsets.push_back(offset);
+ sizes.push_back(size);
+ }
+ return {offsets, sizes};
+ } else {
+ for (auto [tileSize, loopRange] :
+ llvm::zip_equal(tileSizes, iterationDomain)) {
+
+ // Non-tiled cases, set the offset and size to the
+ // `loopRange.offset/size`.
+ if (isConstantIntValue(tileSize, 0)) {
+ offsets.push_back(loopRange.offset);
+ sizes.push_back(loopRange.size);
+ continue;
+ }
+
+ Value iv = ivs[materializedLoopNum++];
+ OpFoldResult offset = getAsOpFoldResult(iv);
+ offsets.push_back(offset);
+ OpFoldResult size =
+ getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize);
+ sizes.push_back(size);
+ }
+ return {offsets, sizes};
+ }
+}
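Putting the pieces together, here is a plain-integer sketch of the per-thread offset/size computation above for the loop `[0, 10)` with 7 threads (unit step assumed):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t lb = 0, ub = 10, numThreads = 7;
  int64_t tileSize = (ub - lb + numThreads - 1) / numThreads; // ceilDiv == 2
  for (int64_t iv = 0; iv < numThreads; ++iv) {
    int64_t offset = lb + iv * tileSize;            // offsetExpr: d0 + d1 * s0
    int64_t size = std::min(tileSize, ub - offset); // clip the residual tile
    size = std::max<int64_t>(size, 0);              // affine.max(0, size)
    assert(size >= 0);
    assert(size == 0 || (offset >= lb && offset + size <= ub));
  }
  return 0;
}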
+
+/// Function to return the bounds of the loops to be generated.
+static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>,
+ SmallVector<OpFoldResult>>
+getLoopBounds(RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
+ ArrayRef<OpFoldResult> tileSizes) {
+ SmallVector<OpFoldResult> lbs, ubs, steps;
+ for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) {
+ // No loop if the tile size is 0.
+ if (isConstantIntValue(tileSize, 0))
+ continue;
+ lbs.push_back(loopRange.offset);
+ ubs.push_back(loopRange.size);
+ steps.push_back(tileSize);
+ }
+ return {lbs, ubs, steps};
}
/// A function that allows returning additional yielded values during
@@ -152,17 +397,19 @@ static LogicalResult generateLoopNestUsingForOp(
assert(loopRanges.size() == tileSizes.size() &&
"expected as many tile sizes as loop ranges");
OpBuilder::InsertionGuard guard(rewriter);
- SmallVector<Value> ivs;
- for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) {
- // No loops if tile size is zero. Set offset and size to the loop
- // offset and size.
- if (isConstantIntValue(tileSize, 0))
- continue;
+ SmallVector<OpFoldResult> lbs, ubs, steps;
+ std::tie(lbs, ubs, steps) =
+ getLoopBounds(rewriter, loc, loopRanges, tileSizes);
+ SmallVector<Value> lbVals =
+ getValueOrCreateConstantIndexOp(rewriter, loc, lbs);
+ SmallVector<Value> ubVals =
+ getValueOrCreateConstantIndexOp(rewriter, loc, ubs);
+ SmallVector<Value> stepVals =
+ getValueOrCreateConstantIndexOp(rewriter, loc, steps);
- Value lb = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.offset);
- Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, loopRange.size);
- Value step = getValueOrCreateConstantIndexOp(rewriter, loc, tileSize);
+ SmallVector<Value> ivs;
+ for (auto [lb, ub, step] : llvm::zip_equal(lbVals, ubVals, stepVals)) {
auto loop =
rewriter.create<scf::ForOp>(loc, lb, ub, step, destinationTensors,
[](OpBuilder &bodyBuilder, Location bodyLoc,
@@ -224,10 +471,9 @@ static LogicalResult generateLoopNestUsingForOp(
/// populated.
static LogicalResult generateLoopNestUsingForallOp(
RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> tileSizes, ArrayRef<Attribute> mappingVector,
- ValueRange destinationTensors, YieldTiledValuesFn tiledBodyFn,
- SmallVector<LoopLikeOpInterface> &loops) {
- SmallVector<OpFoldResult> lbs, ubs, steps;
+ ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads,
+ ArrayRef<Attribute> mappingVector, ValueRange destinationTensors,
+ YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) {
assert(!loopRanges.empty() && "unexpected empty loop ranges");
assert(loopRanges.size() == tileSizes.size() &&
"expected as many tile sizes as loop ranges");
@@ -235,21 +481,30 @@ static LogicalResult generateLoopNestUsingForallOp(
SmallVector<OpFoldResult> offsets(loopRanges.size()),
sizes(loopRanges.size());
- for (auto [tileSize, loopRange] : llvm::zip_equal(tileSizes, loopRanges)) {
- if (isConstantIntValue(tileSize, 0))
- continue;
- lbs.push_back(loopRange.offset);
- ubs.push_back(loopRange.size);
- steps.push_back(tileSize);
- }
- assert(!lbs.empty() && "Expected at least one loop range");
-
std::optional<ArrayAttr> mappingAttr;
if (!mappingVector.empty())
mappingAttr = rewriter.getArrayAttr(mappingVector);
- auto forallOp = rewriter.create<scf::ForallOp>(
- loc, lbs, ubs, steps, destinationTensors, mappingAttr);
+ scf::ForallOp forallOp;
+ bool useNumThreads = !numThreads.empty();
+
+ if (useNumThreads) {
+ // Prune the zero numthreads.
+ SmallVector<OpFoldResult> nonZeroNumThreads;
+ for (auto nt : numThreads) {
+ if (isConstantIntValue(nt, 0))
+ continue;
+ nonZeroNumThreads.push_back(nt);
+ }
+ forallOp = rewriter.create<scf::ForallOp>(loc, nonZeroNumThreads,
+ destinationTensors, mappingAttr);
+ } else {
+ SmallVector<OpFoldResult> lbs, ubs, steps;
+ std::tie(lbs, ubs, steps) =
+ getLoopBounds(rewriter, loc, loopRanges, tileSizes);
+ forallOp = rewriter.create<scf::ForallOp>(loc, lbs, ubs, steps,
+ destinationTensors, mappingAttr);
+ }
loops.push_back(forallOp);
rewriter.setInsertionPoint(forallOp.getTerminator());
@@ -286,13 +541,11 @@ static LogicalResult generateLoopNestUsingForallOp(
/// loop.
/// - `loops` is an in-out parameter into which the generated loops are
/// populated.
-static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
- const scf::SCFTilingOptions &options,
- ArrayRef<Range> loopRanges,
- ArrayRef<OpFoldResult> tileSizes,
- ValueRange destinationTensors,
- YieldTiledValuesFn tiledBodyFn,
- SmallVector<LoopLikeOpInterface> &loops) {
+static LogicalResult generateLoopNest(
+ RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options,
+ ArrayRef<Range> loopRanges, ArrayRef<OpFoldResult> tileSizes,
+ ArrayRef<OpFoldResult> numThreads, ValueRange destinationTensors,
+ YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) {
// If the tile sizes are all zero, no loops are generated. Just call the
// callback function to handle untiled case.
if (llvm::all_of(tileSizes, isZeroIndex)) {
@@ -307,7 +560,7 @@ static LogicalResult generateLoopNest(RewriterBase &rewriter, Location loc,
}
if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) {
return generateLoopNestUsingForallOp(
- rewriter, loc, loopRanges, tileSizes, options.mappingVector,
+ rewriter, loc, loopRanges, tileSizes, numThreads, options.mappingVector,
destinationTensors, tiledBodyFn, loops);
}
return rewriter.notifyMatchFailure(loc, "unhandled loop type");
@@ -531,27 +784,25 @@ static LogicalResult addInitOperandsToLoopNest(
FailureOr<scf::SCFTilingResult>
mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
const scf::SCFTilingOptions &options) {
+ if (failed(verifyTileSizeOptions(rewriter, op.getLoc(), options))) {
+ return failure();
+ }
+
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPointAfter(op);
- if (!options.tileSizeComputationFunction) {
- return rewriter.notifyMatchFailure(
- op, "missing tile size computation function");
- }
-
// 1. Get the range of the loops that are represented by the operation.
SmallVector<Range> iterationDomain = op.getIterationDomain(rewriter);
- size_t numLoops = iterationDomain.size();
- // 2. Materialize the tile sizes. Enforce the convention that "tiling by zero"
- // skips tiling a particular dimension. This convention is significantly
- // simpler to handle instead of adjusting affine maps to account for missing
- // dimensions.
- SmallVector<OpFoldResult> tileSizes =
- options.tileSizeComputationFunction(rewriter, op);
- if (tileSizes.size() < iterationDomain.size()) {
- auto zero = rewriter.getIndexAttr(0);
- tileSizes.append(numLoops - tileSizes.size(), zero);
+ // 2. Materialize the tile sizes and/or number of threads;
+ SmallVector<OpFoldResult> tileSizes, numThreads;
+ std::tie(tileSizes, numThreads) =
+ getUserTileSizesAndNumThreads(rewriter, op, iterationDomain, options);
+
+  // Check if it is safe to tile. This is a holdover from previous iterations
+  // of tiling to `scf.forall`. Consider dropping it.
+ if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) {
+ checkSafeToTileToForall(op, tileSizes, numThreads);
}
// 3. If there is an interchange specified, permute the iteration domain and
@@ -560,16 +811,13 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
if (!options.interchangeVector.empty()) {
interchangeVector = fillInterchangeVector(options.interchangeVector,
iterationDomain.size());
- }
- if (!interchangeVector.empty()) {
- if (!isPermutationVector(interchangeVector)) {
- return rewriter.notifyMatchFailure(
- op, "invalid intechange vector, not a permutation of the entire "
- "iteration space");
- }
+ assert(isPermutationVector(interchangeVector) &&
+ "expected interchange vector to be a permutation");
applyPermutationToVector(iterationDomain, interchangeVector);
applyPermutationToVector(tileSizes, interchangeVector);
+ if (!numThreads.empty())
+ applyPermutationToVector(numThreads, interchangeVector);
}
FailureOr<TilingResult> tilingResult;
@@ -583,21 +831,8 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
-> LogicalResult {
// 4a. Compute the `offsets` and `sizes` to use for tiling.
SmallVector<OpFoldResult> offsets, sizes;
- {
- int materializedLoopNum = 0;
- for (auto [tileSize, loopRange] :
- llvm::zip_equal(tileSizes, iterationDomain)) {
- if (isConstantIntValue(tileSize, 0)) {
- offsets.push_back(loopRange.offset);
- sizes.push_back(loopRange.size);
- continue;
- }
- Value iv = ivs[materializedLoopNum++];
- offsets.push_back(iv);
- sizes.push_back(
- getBoundedTileSize(rewriter, loc, loopRange, iv, tileSize));
- }
- }
+ std::tie(offsets, sizes) = getTileOffsetAndSizes(
+ rewriter, loc, ivs, iterationDomain, tileSizes, numThreads);
// 4b. If interchange was provided, apply inverse of the interchange
// to get back the offsets/sizes in the order to be specified.
@@ -665,7 +900,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
// 7. Generate the tiled loops nest using the callback defined above.
SmallVector<LoopLikeOpInterface> loops;
if (failed(generateLoopNest(rewriter, op.getLoc(), options, iterationDomain,
- tileSizes, destinationTensors,
+ tileSizes, numThreads, destinationTensors,
innerYieldTiledValuesFn, loops)))
return op.emitOpError("failed to generate tiling loops");
assert(succeeded(tilingResult) &&
@@ -781,6 +1016,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
scf::SCFTilingOptions options;
options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp);
if (failed(generateLoopNest(b, loc, options, iterationDomain, tileSizesVector,
+ /*numThreads=*/ArrayRef<OpFoldResult>{},
initTensors, innerYieldTiledValuesFn, loops)))
return b.notifyMatchFailure(op, "failed to tile for parallel reduction");
diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
index c0ee9d2..9df6e24 100644
--- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp
@@ -294,8 +294,8 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
}
/// Returns the trip count of `forOp` if its' low bound, high bound and step are
-/// constants, or optional otherwise. Trip count is computed as ceilDiv(highBound
-/// - lowBound, step).
+/// constants, or optional otherwise. Trip count is computed as
+/// ceilDiv(highBound - lowBound, step).
static std::optional<int64_t> getConstantTripCount(scf::ForOp forOp) {
std::optional<int64_t> lbCstOp = getConstantIntValue(forOp.getLowerBound());
std::optional<int64_t> ubCstOp = getConstantIntValue(forOp.getUpperBound());
@@ -1363,3 +1363,37 @@ scf::ForOp mlir::fuseIndependentSiblingForLoops(scf::ForOp target,
return fusedLoop;
}
+
+FailureOr<scf::ForallOp> mlir::normalizeForallOp(RewriterBase &rewriter,
+ scf::ForallOp forallOp) {
+ SmallVector<OpFoldResult> lbs = forallOp.getMixedLowerBound();
+ SmallVector<OpFoldResult> ubs = forallOp.getMixedUpperBound();
+ SmallVector<OpFoldResult> steps = forallOp.getMixedStep();
+
+ if (llvm::all_of(
+ lbs, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 0); }) &&
+ llvm::all_of(
+ steps, [](OpFoldResult ofr) { return isConstantIntValue(ofr, 1); })) {
+ return forallOp;
+ }
+
+ SmallVector<OpFoldResult> newLbs, newUbs, newSteps;
+ for (auto [lb, ub, step] : llvm::zip_equal(lbs, ubs, steps)) {
+ Range normalizedLoopParams =
+ emitNormalizedLoopBounds(rewriter, forallOp.getLoc(), lb, ub, step);
+ newLbs.push_back(normalizedLoopParams.offset);
+ newUbs.push_back(normalizedLoopParams.size);
+ newSteps.push_back(normalizedLoopParams.stride);
+ }
+
+ auto normalizedForallOp = rewriter.create<scf::ForallOp>(
+ forallOp.getLoc(), newLbs, newUbs, newSteps, forallOp.getOutputs(),
+ forallOp.getMapping(), [](OpBuilder &, Location, ValueRange) {});
+
+ rewriter.inlineRegionBefore(forallOp.getBodyRegion(),
+ normalizedForallOp.getBodyRegion(),
+ normalizedForallOp.getBodyRegion().begin());
+
+ rewriter.replaceAllOpUsesWith(forallOp, normalizedForallOp);
+  return normalizedForallOp;
+}
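Assuming `emitNormalizedLoopBounds` rewrites each `(lb, ub, step)` triple to `(0, ceilDiv(ub - lb, step), 1)` — an assumption based on the usual normalization, not spelled out in this hunk — the per-dimension arithmetic is:

#include <cassert>
#include <cstdint>

struct Bounds { int64_t lb, ub, step; };

static Bounds normalize(Bounds b) {
  return {0, (b.ub - b.lb + b.step - 1) / b.step, 1};
}

int main() {
  Bounds n = normalize({/*lb=*/4, /*ub=*/20, /*step=*/3});
  assert(n.lb == 0 && n.ub == 6 && n.step == 1); // ceilDiv(16, 3) == 6
  return 0;
}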
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 616e91a..1135ea3 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -2131,10 +2131,82 @@ static void printLevelRange(OpAsmPrinter &p, Operation *, IntegerAttr lvlLo,
printLevelRange(p, lo, hi);
}
+/// Parses a list of optionally defined values in the form of
+/// "(%val0, _, %val1, ...)", where `_` is used to annotate that the
+/// corresponding value is not defined (e.g., to represent an undefined
+/// coordinate in the sparse iteration space).
+static ParseResult parseOptionalDefinedList(
+ OpAsmParser &parser, OperationState &state, I64BitSet &definedSet,
+ SmallVectorImpl<OpAsmParser::Argument> &definedArgs,
+ unsigned maxCnt = std::numeric_limits<unsigned>::max(),
+ OpAsmParser::Delimiter delimiter = OpAsmParser::Delimiter::Paren) {
+ unsigned cnt = 0;
+ ParseResult crdList =
+ parser.parseCommaSeparatedList(delimiter, [&]() -> ParseResult {
+ if (parser.parseOptionalKeyword("_")) {
+ if (parser.parseArgument(definedArgs.emplace_back()))
+ return failure();
+ definedSet.set(cnt);
+ }
+ cnt += 1;
+ return success();
+ });
+
+ if (cnt > maxCnt)
+ return parser.emitError(parser.getNameLoc(),
+                             "parsed more values than expected.");
+
+ if (failed(crdList)) {
+ return parser.emitError(
+ parser.getNameLoc(),
+ "expecting SSA value or \"_\" for level coordinates");
+ }
+ assert(definedArgs.size() == definedSet.count());
+ return success();
+}
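To illustrate the encoding this parser produces — shown here with a standalone `std::bitset` stand-in rather than `I64BitSet` — the textual list `(%a, _, %b)` marks bits 0 and 2 as defined, and only those positions receive block arguments:

#include <bitset>
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> tokens = {"%a", "_", "%b"};
  std::bitset<64> definedSet;
  std::vector<std::string> definedArgs;
  for (size_t i = 0; i < tokens.size(); ++i) {
    if (tokens[i] != "_") {
      definedSet.set(i);
      definedArgs.push_back(tokens[i]);
    }
  }
  assert(definedSet.to_ullong() == 0b101);          // bits 0 and 2
  assert(definedArgs.size() == definedSet.count()); // mirrors the assert above
  return 0;
}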
+
+static void printOptionalDefinedList(OpAsmPrinter &p, unsigned size,
+ Block::BlockArgListType blocksArgs,
+ I64BitSet definedSet) {
+ if (definedSet.empty())
+ return;
+
+ for (unsigned i = 0; i < size; i++) {
+ if (definedSet[i]) {
+ p << blocksArgs.front();
+ blocksArgs = blocksArgs.drop_front();
+ } else {
+ p << "_";
+ }
+ if (i != size - 1)
+ p << ", ";
+ }
+ assert(blocksArgs.empty());
+}
+
+static ParseResult
+parseUsedCoordList(OpAsmParser &parser, OperationState &state,
+ SmallVectorImpl<OpAsmParser::Argument> &coords) {
+ // Parse "at(%crd0, _, ...)"
+ I64BitSet crdUsedLvlSet;
+ if (succeeded(parser.parseOptionalKeyword("at")) &&
+ failed(parseOptionalDefinedList(parser, state, crdUsedLvlSet, coords)))
+ return failure();
+
+ // Always use IndexType for the coordinate.
+ for (auto &coord : coords)
+ coord.type = parser.getBuilder().getIndexType();
+
+ // Set the CrdUsedLvl bitset.
+ state.addAttribute("crdUsedLvls",
+ parser.getBuilder().getI64IntegerAttr(crdUsedLvlSet));
+ return success();
+}
+
static ParseResult
-parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
- SmallVectorImpl<OpAsmParser::Argument> &iterators,
- SmallVectorImpl<OpAsmParser::Argument> &iterArgs) {
+parseSparseIterateLoop(OpAsmParser &parser, OperationState &state,
+ SmallVectorImpl<OpAsmParser::Argument> &iterators,
+ SmallVectorImpl<OpAsmParser::Argument> &blockArgs) {
SmallVector<OpAsmParser::UnresolvedOperand> spaces;
SmallVector<OpAsmParser::UnresolvedOperand> initArgs;
@@ -2148,37 +2220,14 @@ parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
parser.getNameLoc(),
"mismatch in number of sparse iterators and sparse spaces");
- // Parse "at(%crd0, _, ...)"
- LevelSet crdUsedLvlSet;
- bool hasUsedCrds = succeeded(parser.parseOptionalKeyword("at"));
- unsigned lvlCrdCnt = 0;
- if (hasUsedCrds) {
- ParseResult crdList = parser.parseCommaSeparatedList(
- OpAsmParser::Delimiter::Paren, [&]() -> ParseResult {
- if (parser.parseOptionalKeyword("_")) {
- if (parser.parseArgument(iterArgs.emplace_back()))
- return failure();
- // Always use IndexType for the coordinate.
- crdUsedLvlSet.set(lvlCrdCnt);
- iterArgs.back().type = parser.getBuilder().getIndexType();
- }
- lvlCrdCnt += 1;
- return success();
- });
- if (failed(crdList)) {
- return parser.emitError(
- parser.getNameLoc(),
- "expecting SSA value or \"_\" for level coordinates");
- }
- }
- // Set the CrdUsedLvl bitset.
- state.addAttribute("crdUsedLvls",
- parser.getBuilder().getI64IntegerAttr(crdUsedLvlSet));
+ if (failed(parseUsedCoordList(parser, state, blockArgs)))
+ return failure();
+ size_t numCrds = blockArgs.size();
// Parse "iter_args(%arg = %init, ...)"
bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args"));
if (hasIterArgs)
- if (parser.parseAssignmentList(iterArgs, initArgs))
+ if (parser.parseAssignmentList(blockArgs, initArgs))
return failure();
SmallVector<Type> iterSpaceTps;
@@ -2196,10 +2245,6 @@ parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
return parser.emitError(parser.getNameLoc(),
"expected sparse_tensor.iter_space type for "
"iteration space operands");
- if (hasUsedCrds && spaceTp.getSpaceDim() != lvlCrdCnt)
- return parser.emitError(parser.getNameLoc(),
- "mismatch in number of iteration space dimension "
- "and specified coordinates");
it.type = spaceTp.getIteratorType();
}
@@ -2213,9 +2258,68 @@ parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state,
return failure();
if (hasIterArgs) {
- unsigned numCrds = crdUsedLvlSet.count();
// Strip off leading args that used for coordinates.
- MutableArrayRef args = MutableArrayRef(iterArgs).drop_front(numCrds);
+ MutableArrayRef args = MutableArrayRef(blockArgs).drop_front(numCrds);
+ if (args.size() != initArgs.size() || args.size() != state.types.size()) {
+ return parser.emitError(
+ parser.getNameLoc(),
+ "mismatch in number of iteration arguments and return values");
+ }
+
+ for (auto [it, init, tp] : llvm::zip_equal(args, initArgs, state.types)) {
+ it.type = tp;
+ if (parser.resolveOperand(init, tp, state.operands))
+ return failure();
+ }
+ }
+ return success();
+}
+
+static ParseResult
+parseSparseCoIterateLoop(OpAsmParser &parser, OperationState &state,
+ SmallVectorImpl<Value> &spacesVals,
+ SmallVectorImpl<OpAsmParser::Argument> &blockArgs) {
+
+ // Parse "(%spaces, ...)"
+ SmallVector<OpAsmParser::UnresolvedOperand> spaces;
+ if (parser.parseOperandList(spaces, OpAsmParser::Delimiter::Paren))
+ return failure();
+
+ if (failed(parseUsedCoordList(parser, state, blockArgs)))
+ return failure();
+ size_t numCrds = blockArgs.size();
+
+ // Parse "iter_args(%arg = %init, ...)"
+ SmallVector<OpAsmParser::UnresolvedOperand> initArgs;
+ bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args"));
+ if (hasIterArgs)
+ if (parser.parseAssignmentList(blockArgs, initArgs))
+ return failure();
+
+ SmallVector<Type> iterSpaceTps;
+ // parse ": (sparse_tensor.iter_space, ...) -> ret"
+ if (parser.parseColon() || parser.parseLParen() ||
+ parser.parseTypeList(iterSpaceTps) || parser.parseRParen())
+ return failure();
+
+ if (iterSpaceTps.size() != spaces.size())
+ return parser.emitError(parser.getNameLoc(),
+ "mismatch in number of iteration space operands "
+ "and iteration space types");
+
+ if (hasIterArgs)
+ if (parser.parseArrowTypeList(state.types))
+ return failure();
+
+ // Resolves input sparse iteration spaces.
+ if (parser.resolveOperands(spaces, iterSpaceTps, parser.getNameLoc(),
+ spacesVals))
+ return failure();
+ state.operands.append(spacesVals);
+
+ if (hasIterArgs) {
+    // Strip off leading args that are used for coordinates.
+ MutableArrayRef args = MutableArrayRef(blockArgs).drop_front(numCrds);
if (args.size() != initArgs.size() || args.size() != state.types.size()) {
return parser.emitError(
parser.getNameLoc(),
@@ -2267,12 +2371,25 @@ LogicalResult ExtractIterSpaceOp::verify() {
return success();
}
+LogicalResult ExtractValOp::verify() {
+ auto stt = getSparseTensorType(getTensor());
+ auto itTp = getIterator().getType();
+
+ if (stt.getEncoding() != itTp.getEncoding())
+ return emitOpError("mismatch in tensor encoding and iterator encoding.");
+
+ if (stt.getLvlRank() != itTp.getHiLvl())
+ return emitOpError("must use last-level iterator to extract values. ");
+
+ return success();
+}
+
struct RemoveUnusedLvlCrds : public OpRewritePattern<IterateOp> {
using OpRewritePattern::OpRewritePattern;
LogicalResult matchAndRewrite(IterateOp iterateOp,
PatternRewriter &rewriter) const override {
- LevelSet newUsedLvls(0);
+ I64BitSet newUsedLvls(0);
llvm::BitVector toRemove(iterateOp.getBody()->getNumArguments());
for (unsigned i = 0, e = iterateOp.getSpaceDim(); i < e; i++) {
if (auto crd = iterateOp.getLvlCrd(i)) {
@@ -2304,13 +2421,13 @@ void IterateOp::build(OpBuilder &builder, OperationState &odsState,
Value iterSpace, ValueRange initArgs) {
unsigned rank = llvm::cast<IterSpaceType>(iterSpace.getType()).getSpaceDim();
// All ones.
- LevelSet set((1 << rank) - 1);
+ I64BitSet set((1 << rank) - 1);
return build(builder, odsState, iterSpace, initArgs, set);
}
void IterateOp::build(OpBuilder &builder, OperationState &odsState,
Value iterSpace, ValueRange initArgs,
- LevelSet crdUsedLvls) {
+ I64BitSet crdUsedLvls) {
OpBuilder::InsertionGuard guard(builder);
odsState.addOperands(iterSpace);
@@ -2340,7 +2457,7 @@ ParseResult IterateOp::parse(OpAsmParser &parser, OperationState &result) {
OpAsmParser::UnresolvedOperand iterSpace;
SmallVector<OpAsmParser::Argument> iters, iterArgs;
- if (parseSparseSpaceLoop(parser, result, iters, iterArgs))
+ if (parseSparseIterateLoop(parser, result, iters, iterArgs))
return failure();
if (iters.size() != 1)
return parser.emitError(parser.getNameLoc(),
@@ -2380,51 +2497,39 @@ static void printInitializationList(OpAsmPrinter &p,
p << ")";
}
-static void printUsedCrdsList(OpAsmPrinter &p, unsigned spaceDim,
- Block::BlockArgListType blocksArgs,
- LevelSet crdUsedLvls) {
- if (crdUsedLvls.empty())
- return;
-
- p << " at(";
- for (unsigned i = 0; i < spaceDim; i++) {
- if (crdUsedLvls[i]) {
- p << blocksArgs.front();
- blocksArgs = blocksArgs.drop_front();
- } else {
- p << "_";
- }
- if (i != spaceDim - 1)
- p << ", ";
+template <typename SparseLoopOp>
+static LogicalResult verifySparseLoopOp(SparseLoopOp op) {
+ if (op.getInitArgs().size() != op.getNumResults()) {
+ return op.emitOpError(
+ "mismatch in number of loop-carried values and defined values");
}
- assert(blocksArgs.empty());
- p << ")";
+ if (op.getCrdUsedLvls().max() > op.getSpaceDim())
+ return op.emitOpError("required out-of-bound coordinates");
+
+ return success();
}
+LogicalResult IterateOp::verify() { return verifySparseLoopOp(*this); }
+LogicalResult CoIterateOp::verify() { return verifySparseLoopOp(*this); }
+
void IterateOp::print(OpAsmPrinter &p) {
p << " " << getIterator() << " in " << getIterSpace();
- printUsedCrdsList(p, getSpaceDim(), getCrds(), getCrdUsedLvls());
+ if (!getCrdUsedLvls().empty()) {
+ p << " at(";
+ printOptionalDefinedList(p, getSpaceDim(), getCrds(), getCrdUsedLvls());
+ p << ")";
+ }
printInitializationList(p, getRegionIterArgs(), getInitArgs(), " iter_args");
p << " : " << getIterSpace().getType() << " ";
if (!getInitArgs().empty())
- p << "-> (" << getInitArgs().getTypes() << ") ";
+ p.printArrowTypeList(getInitArgs().getTypes());
+ p << " ";
p.printRegion(getRegion(), /*printEntryBlockArgs=*/false,
/*printBlockTerminators=*/!getInitArgs().empty());
}
-LogicalResult IterateOp::verify() {
- if (getInitArgs().size() != getNumResults()) {
- return emitOpError(
- "mismatch in number of loop-carried values and defined values");
- }
- if (getCrdUsedLvls().max() > getSpaceDim())
- return emitOpError("required out-of-bound coordinates");
-
- return success();
-}
-
LogicalResult IterateOp::verifyRegions() {
if (getIterator().getType() != getIterSpace().getType().getIteratorType())
return emitOpError("mismatch in iterator and iteration space type");
@@ -2482,13 +2587,136 @@ OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) {
void IterateOp::getSuccessorRegions(RegionBranchPoint point,
SmallVectorImpl<RegionSuccessor> &regions) {
- // Both the operation itself and the region may be branching into the body or
- // back into the operation itself.
+ // Both the operation itself and the region may be branching into the body
+ // or back into the operation itself.
regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs()));
// It is possible for loop not to enter the body.
regions.push_back(RegionSuccessor(getResults()));
}
+ParseResult CoIterateOp::parse(OpAsmParser &parser, OperationState &result) {
+
+ SmallVector<Value> spaces;
+  // The block argument list of each region, arranged in the order of
+ // ([used coordinate list], [loop iterations args], [sparse iterator list]).
+ SmallVector<OpAsmParser::Argument> blockArgs;
+ if (parseSparseCoIterateLoop(parser, result, spaces, blockArgs))
+ return failure();
+
+ result.addAttribute("operandSegmentSizes",
+ parser.getBuilder().getDenseI32ArrayAttr(
+ {static_cast<int32_t>(spaces.size()),
+ static_cast<int32_t>(result.types.size())}));
+
+ SmallVector<Attribute> cases;
+ while (succeeded(parser.parseOptionalKeyword("case"))) {
+ // Parse one region per case.
+ I64BitSet definedItSet;
+ SmallVector<OpAsmParser::Argument> definedIts;
+ if (parseOptionalDefinedList(parser, result, definedItSet, definedIts,
+ spaces.size(), OpAsmParser::Delimiter::None))
+ return failure();
+
+ cases.push_back(parser.getBuilder().getI64IntegerAttr(definedItSet));
+
+ for (auto [i, definedIdx] : llvm::enumerate(definedItSet.bits())) {
+ // Resolve the iterator type based on the iteration space type.
+ auto spaceTp = llvm::cast<IterSpaceType>(spaces[definedIdx].getType());
+ definedIts[i].type = spaceTp.getIteratorType();
+ }
+ definedIts.insert(definedIts.begin(), blockArgs.begin(), blockArgs.end());
+ Region *body = result.addRegion();
+ if (parser.parseRegion(*body, definedIts))
+ return failure();
+
+ CoIterateOp::ensureTerminator(*body, parser.getBuilder(), result.location);
+ }
+
+ result.addAttribute("cases", ArrayAttr::get(parser.getContext(), cases));
+
+ // Parse the optional attribute list.
+ if (parser.parseOptionalAttrDict(result.attributes))
+ return failure();
+
+ return success();
+}
+
+void CoIterateOp::print(OpAsmPrinter &p) {
+ p << " (";
+ llvm::interleaveComma(getIterSpaces(), p, [&](auto s) { p << s; });
+ p << ")";
+
+ if (!getCrdUsedLvls().empty()) {
+ p << " at(";
+ printOptionalDefinedList(p, getSpaceDim(), getCrds(0), getCrdUsedLvls());
+ p << ")";
+ }
+
+ printInitializationList(p, getRegionIterArgs(0), getInitArgs(), " iter_args");
+
+ p << " : (" << getIterSpaces().getTypes() << ")";
+ if (!getInitArgs().empty())
+ p.printArrowTypeList(getInitArgs().getTypes());
+
+ for (unsigned idx = 0, e = getRegions().size(); idx < e; idx++) {
+ p.printNewline();
+ p << "case ";
+ printOptionalDefinedList(p, getIterSpaces().size(), getRegionIterators(idx),
+ getRegionDefinedSpace(idx));
+ p << " ";
+ p.printRegion(getRegion(idx), /*printEntryBlockArgs=*/false,
+ /*printBlockTerminators=*/!getInitArgs().empty());
+ }
+}
+
+ValueRange CoIterateOp::getYieldedValues(unsigned regionIdx) {
+ return cast<sparse_tensor::YieldOp>(
+ getRegion(regionIdx).getBlocks().front().getTerminator())
+ .getResults();
+}
+
+LogicalResult CoIterateOp::verifyRegions() {
+ for (unsigned r = 0, e = getNumRegions(); r < e; r++) {
+ if (getNumRegionIterArgs(r) != getNumResults())
+ return emitOpError(
+ "mismatch in number of basic block args and defined values");
+
+ auto initArgs = getInitArgs();
+ auto iterArgs = getRegionIterArgs(r);
+ auto yieldVals = getYieldedValues(r);
+ auto opResults = getResults();
+ if (!llvm::all_equal({initArgs.size(), iterArgs.size(), yieldVals.size(),
+ opResults.size()})) {
+ return emitOpError()
+ << "number mismatch between iter args and results on " << r
+ << "th region";
+ }
+
+ for (auto [i, init, iter, yield, ret] :
+ llvm::enumerate(initArgs, iterArgs, yieldVals, opResults)) {
+ if (init.getType() != ret.getType())
+ return emitOpError()
+ << "types mismatch between " << i
+ << "th iter operand and defined value on " << r << "th region";
+ if (iter.getType() != ret.getType())
+ return emitOpError() << "types mismatch between " << i
+ << "th iter region arg and defined value on " << r
+ << "th region";
+ if (yield.getType() != ret.getType())
+ return emitOpError()
+ << "types mismatch between " << i
+ << "th yield value and defined value on " << r << "th region";
+ }
+ }
+
+ auto cases = getRegionDefinedSpaces();
+ llvm::SmallSetVector<uint64_t, 8> set(cases.begin(), cases.end());
+ if (set.size() != getNumRegions())
+ return emitOpError("contains duplicated cases.");
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// Sparse Tensor Dialect Setups.
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
index 1d614b7..b1451de 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
@@ -2,6 +2,7 @@
#include "Utils/CodegenUtils.h"
#include "Utils/SparseTensorIterator.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
@@ -10,8 +11,8 @@
using namespace mlir;
using namespace mlir::sparse_tensor;
-void convertLevelType(SparseTensorEncodingAttr enc, Level lvl,
- SmallVectorImpl<Type> &fields) {
+static void convertLevelType(SparseTensorEncodingAttr enc, Level lvl,
+ SmallVectorImpl<Type> &fields) {
// Position and coordinate buffer in the sparse structure.
if (enc.getLvlType(lvl).isWithPosLT())
fields.push_back(enc.getPosMemRefType());
@@ -71,6 +72,21 @@ public:
}
};
+/// Sparse codegen rule for the extract value operator.
+class ExtractValOpConverter : public OneToNOpConversionPattern<ExtractValOp> {
+public:
+ using OneToNOpConversionPattern::OneToNOpConversionPattern;
+ LogicalResult
+ matchAndRewrite(ExtractValOp op, OpAdaptor adaptor,
+ OneToNPatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+ Value pos = adaptor.getIterator().back();
+ Value valBuf = rewriter.create<ToValuesOp>(loc, op.getTensor());
+ rewriter.replaceOpWithNewOp<memref::LoadOp>(op, valBuf, pos);
+ return success();
+ }
+};
+
class SparseIterateOpConverter : public OneToNOpConversionPattern<IterateOp> {
public:
using OneToNOpConversionPattern::OneToNOpConversionPattern;
@@ -193,6 +209,6 @@ void mlir::populateLowerSparseIterationToSCFPatterns(
TypeConverter &converter, RewritePatternSet &patterns) {
IterateOp::getCanonicalizationPatterns(patterns, patterns.getContext());
- patterns.add<ExtractIterSpaceConverter, SparseIterateOpConverter>(
- converter, patterns.getContext());
+ patterns.add<ExtractIterSpaceConverter, ExtractValOpConverter,
+ SparseIterateOpConverter>(converter, patterns.getContext());
}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp
index 924046fc..f85c476 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseSpaceCollapse.cpp
@@ -141,10 +141,10 @@ void collapseSparseSpace(MutableArrayRef<CollapseSpaceInfo> toCollapse) {
auto cloned = llvm::cast<IterateOp>(builder.clone(*innermost, mapper));
builder.setInsertionPointToStart(cloned.getBody());
- LevelSet crdUsedLvls;
+ I64BitSet crdUsedLvls;
unsigned shift = 0, argIdx = 1;
for (auto info : toCollapse.drop_back()) {
- LevelSet set = info.loop.getCrdUsedLvls();
+ I64BitSet set = info.loop.getCrdUsedLvls();
crdUsedLvls |= set.lshift(shift);
shift += info.loop.getSpaceDim();
for (BlockArgument crd : info.loop.getCrds()) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index c612a52..08fc104 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -357,6 +357,9 @@ static Value genSubscript(CodegenEnv &env, OpBuilder &builder, OpOperand *t,
const auto pos = env.emitter().getValPosits(tid);
assert(!pos.empty());
args.append(pos);
+    // Simply return the tensor; the value is extracted later using iterators.
+ if (env.options().sparseEmitStrategy == SparseEmitStrategy::kSparseIterator)
+ return t->get();
} else {
// For dense tensors we push all level's coordinates onto `args`.
const Level lvlRank = stt.getLvlRank();
@@ -512,9 +515,16 @@ static Value genTensorLoad(CodegenEnv &env, OpBuilder &builder, ExprId exp) {
return genInsertionLoadReduce(env, builder, t);
return genInsertionLoad(env, builder, t);
}
+
// Actual load.
SmallVector<Value> args;
Value ptr = genSubscript(env, builder, t, args);
+ if (llvm::isa<TensorType>(ptr.getType())) {
+ assert(env.options().sparseEmitStrategy ==
+ SparseEmitStrategy::kSparseIterator &&
+ args.size() == 1);
+ return builder.create<ExtractValOp>(loc, ptr, args.front());
+ }
return builder.create<memref::LoadOp>(loc, ptr, args);
}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 2a884b1..f3e73e4 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -221,6 +221,11 @@ public:
/// Getters.
///
SmallVector<Value> getValPosits(TensorId tid) const {
+ // Returns the iterator if we are generating sparse (co)iterate-based loops.
+ if (emitStrategy == SparseEmitStrategy::kSparseIterator)
+ return {spIterVals[tid].back()};
+
+ // Returns {[batch coords], last-level position}.
SmallVector<Value> batchCrds = iters[tid].back().back()->getBatchCrds();
Value lastLvlPos = iters[tid].back().back()->getCurPosition().front();
batchCrds.push_back(lastLvlPos);
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index d297c40..5047bd9 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -88,15 +88,14 @@ static MaskFormat getMaskFormat(Value mask) {
// Inspect constant mask index. If the index exceeds the
// dimension size, all bits are set. If the index is zero
// or less, no bits are set.
- ArrayAttr masks = m.getMaskDimSizes();
+ ArrayRef<int64_t> masks = m.getMaskDimSizes();
auto shape = m.getType().getShape();
bool allTrue = true;
bool allFalse = true;
for (auto [maskIdx, dimSize] : llvm::zip_equal(masks, shape)) {
- int64_t i = llvm::cast<IntegerAttr>(maskIdx).getInt();
- if (i < dimSize)
+ if (maskIdx < dimSize)
allTrue = false;
- if (i > 0)
+ if (maskIdx > 0)
allFalse = false;
}
if (allTrue)
@@ -2465,11 +2464,6 @@ void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &results,
// ShuffleOp
//===----------------------------------------------------------------------===//
-void ShuffleOp::build(OpBuilder &builder, OperationState &result, Value v1,
- Value v2, ArrayRef<int64_t> mask) {
- build(builder, result, v1, v2, getVectorSubscriptAttr(builder, mask));
-}
-
LogicalResult ShuffleOp::verify() {
VectorType resultType = getResultVectorType();
VectorType v1Type = getV1VectorType();
@@ -2492,8 +2486,8 @@ LogicalResult ShuffleOp::verify() {
return emitOpError("dimension mismatch");
}
// Verify mask length.
- auto maskAttr = getMask().getValue();
- int64_t maskLength = maskAttr.size();
+ ArrayRef<int64_t> mask = getMask();
+ int64_t maskLength = mask.size();
if (maskLength <= 0)
return emitOpError("invalid mask length");
if (maskLength != resultType.getDimSize(0))
@@ -2501,10 +2495,9 @@ LogicalResult ShuffleOp::verify() {
// Verify all indices.
int64_t indexSize = (v1Type.getRank() == 0 ? 1 : v1Type.getDimSize(0)) +
(v2Type.getRank() == 0 ? 1 : v2Type.getDimSize(0));
- for (const auto &en : llvm::enumerate(maskAttr)) {
- auto attr = llvm::dyn_cast<IntegerAttr>(en.value());
- if (!attr || attr.getInt() < 0 || attr.getInt() >= indexSize)
- return emitOpError("mask index #") << (en.index() + 1) << " out of range";
+ for (auto [idx, maskPos] : llvm::enumerate(mask)) {
+ if (maskPos < 0 || maskPos >= indexSize)
+ return emitOpError("mask index #") << (idx + 1) << " out of range";
}
return success();
}
@@ -2528,13 +2521,12 @@ ShuffleOp::inferReturnTypes(MLIRContext *, std::optional<Location>,
return success();
}
-static bool isStepIndexArray(ArrayAttr idxArr, uint64_t begin, size_t width) {
- uint64_t expected = begin;
- return idxArr.size() == width &&
- llvm::all_of(idxArr.getAsValueRange<IntegerAttr>(),
- [&expected](auto attr) {
- return attr.getZExtValue() == expected++;
- });
+template <typename T>
+static bool isStepIndexArray(ArrayRef<T> idxArr, uint64_t begin, size_t width) {
+ T expected = begin;
+ return idxArr.size() == width && llvm::all_of(idxArr, [&expected](T value) {
+ return value == expected++;
+ });
}
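The predicate above only accepts masks that count up consecutively from `begin`; a self-contained equivalent with plain containers:

#include <cassert>
#include <cstdint>
#include <vector>

static bool isStepIndexArray(const std::vector<int64_t> &idx, int64_t begin,
                             size_t width) {
  if (idx.size() != width)
    return false;
  int64_t expected = begin;
  for (int64_t v : idx)
    if (v != expected++)
      return false;
  return true;
}

int main() {
  assert(isStepIndexArray({2, 3, 4, 5}, 2, 4));  // consecutive run from 2
  assert(!isStepIndexArray({0, 2, 1, 3}, 0, 4)); // out of order
  return 0;
}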
OpFoldResult vector::ShuffleOp::fold(FoldAdaptor adaptor) {
@@ -2569,8 +2561,7 @@ OpFoldResult vector::ShuffleOp::fold(FoldAdaptor adaptor) {
SmallVector<Attribute> results;
auto lhsElements = llvm::cast<DenseElementsAttr>(lhs).getValues<Attribute>();
auto rhsElements = llvm::cast<DenseElementsAttr>(rhs).getValues<Attribute>();
- for (const auto &index : this->getMask().getAsValueRange<IntegerAttr>()) {
- int64_t i = index.getZExtValue();
+ for (int64_t i : this->getMask()) {
if (i >= lhsSize) {
results.push_back(rhsElements[i - lhsSize]);
} else {
@@ -2591,13 +2582,13 @@ struct Canonicalize0DShuffleOp : public OpRewritePattern<ShuffleOp> {
LogicalResult matchAndRewrite(ShuffleOp shuffleOp,
PatternRewriter &rewriter) const override {
VectorType v1VectorType = shuffleOp.getV1VectorType();
- ArrayAttr mask = shuffleOp.getMask();
+ ArrayRef<int64_t> mask = shuffleOp.getMask();
if (v1VectorType.getRank() > 0)
return failure();
if (mask.size() != 1)
return failure();
VectorType resType = VectorType::Builder(v1VectorType).setShape({1});
- if (llvm::cast<IntegerAttr>(mask[0]).getInt() == 0)
+ if (mask[0] == 0)
rewriter.replaceOpWithNewOp<vector::BroadcastOp>(shuffleOp, resType,
shuffleOp.getV1());
else
@@ -2652,11 +2643,11 @@ public:
op, "ShuffleOp types don't match an interleave");
}
- ArrayAttr shuffleMask = op.getMask();
+ ArrayRef<int64_t> shuffleMask = op.getMask();
int64_t resultVectorSize = resultType.getNumElements();
for (int i = 0, e = resultVectorSize / 2; i < e; ++i) {
- int64_t maskValueA = cast<IntegerAttr>(shuffleMask[i * 2]).getInt();
- int64_t maskValueB = cast<IntegerAttr>(shuffleMask[(i * 2) + 1]).getInt();
+ int64_t maskValueA = shuffleMask[i * 2];
+ int64_t maskValueB = shuffleMask[(i * 2) + 1];
if (maskValueA != i || maskValueB != (resultVectorSize / 2) + i)
return rewriter.notifyMatchFailure(op,
"ShuffleOp mask not interleaving");
@@ -3593,8 +3584,7 @@ public:
if (extractStridedSliceOp.hasNonUnitStrides())
return failure();
// Gather constant mask dimension sizes.
- SmallVector<int64_t, 4> maskDimSizes;
- populateFromInt64AttrArray(constantMaskOp.getMaskDimSizes(), maskDimSizes);
+ ArrayRef<int64_t> maskDimSizes = constantMaskOp.getMaskDimSizes();
// Gather strided slice offsets and sizes.
SmallVector<int64_t, 4> sliceOffsets;
populateFromInt64AttrArray(extractStridedSliceOp.getOffsets(),
@@ -3625,7 +3615,7 @@ public:
// region.
rewriter.replaceOpWithNewOp<ConstantMaskOp>(
extractStridedSliceOp, extractStridedSliceOp.getResult().getType(),
- vector::getVectorSubscriptAttr(rewriter, sliceMaskDimSizes));
+ sliceMaskDimSizes);
return success();
}
};
@@ -5410,21 +5400,19 @@ public:
}
if (constantMaskOp) {
- auto maskDimSizes = constantMaskOp.getMaskDimSizes().getValue();
+ auto maskDimSizes = constantMaskOp.getMaskDimSizes();
auto numMaskOperands = maskDimSizes.size();
// Check every mask dim size to see whether it can be dropped
for (size_t i = numMaskOperands - 1; i >= numMaskOperands - numDimsToDrop;
--i) {
- if (cast<IntegerAttr>(maskDimSizes[i]).getValue() != 1)
+ if (maskDimSizes[i] != 1)
return failure();
}
auto newMaskOperands = maskDimSizes.drop_back(numDimsToDrop);
- ArrayAttr newMaskOperandsAttr = rewriter.getArrayAttr(newMaskOperands);
-
rewriter.replaceOpWithNewOp<vector::ConstantMaskOp>(shapeOp, shapeOpResTy,
- newMaskOperandsAttr);
+ newMaskOperands);
return success();
}
@@ -5804,12 +5792,10 @@ public:
// ConstantMaskOp case.
auto maskDimSizes = constantMaskOp.getMaskDimSizes();
- SmallVector<Attribute> newMaskDimSizes(maskDimSizes.getValue());
- applyPermutationToVector(newMaskDimSizes, permutation);
+ auto newMaskDimSizes = applyPermutation(maskDimSizes, permutation);
rewriter.replaceOpWithNewOp<vector::ConstantMaskOp>(
- transpOp, transpOp.getResultVectorType(),
- ArrayAttr::get(transpOp.getContext(), newMaskDimSizes));
+ transpOp, transpOp.getResultVectorType(), newMaskDimSizes);
return success();
}
};
@@ -5832,7 +5818,7 @@ LogicalResult ConstantMaskOp::verify() {
if (resultType.getRank() == 0) {
if (getMaskDimSizes().size() != 1)
return emitError("array attr must have length 1 for 0-D vectors");
- auto dim = llvm::cast<IntegerAttr>(getMaskDimSizes()[0]).getInt();
+ auto dim = getMaskDimSizes()[0];
if (dim != 0 && dim != 1)
return emitError("mask dim size must be either 0 or 1 for 0-D vectors");
return success();
@@ -5846,9 +5832,8 @@ LogicalResult ConstantMaskOp::verify() {
// result dimension size.
auto resultShape = resultType.getShape();
auto resultScalableDims = resultType.getScalableDims();
- SmallVector<int64_t, 4> maskDimSizes;
- for (const auto [index, intAttr] : llvm::enumerate(getMaskDimSizes())) {
- int64_t maskDimSize = llvm::cast<IntegerAttr>(intAttr).getInt();
+ ArrayRef<int64_t> maskDimSizes = getMaskDimSizes();
+ for (const auto [index, maskDimSize] : llvm::enumerate(maskDimSizes)) {
if (maskDimSize < 0 || maskDimSize > resultShape[index])
return emitOpError(
"array attr of size out of bounds of vector result dimension size");
@@ -5856,7 +5841,6 @@ LogicalResult ConstantMaskOp::verify() {
maskDimSize != resultShape[index])
return emitOpError(
"only supports 'none set' or 'all set' scalable dimensions");
- maskDimSizes.push_back(maskDimSize);
}
// Verify that if one mask dim size is zero, they all should be zero (because
// the mask region is a conjunction of each mask dimension interval).
@@ -5873,11 +5857,10 @@ bool ConstantMaskOp::isAllOnesMask() {
// Check the corner case of 0-D vectors first.
if (resultType.getRank() == 0) {
assert(getMaskDimSizes().size() == 1 && "invalid sizes for zero rank mask");
- return llvm::cast<IntegerAttr>(getMaskDimSizes()[0]).getInt() == 1;
+ return getMaskDimSizes()[0] == 1;
}
- for (const auto [resultSize, intAttr] :
+ for (const auto [resultSize, maskDimSize] :
llvm::zip_equal(resultType.getShape(), getMaskDimSizes())) {
- int64_t maskDimSize = llvm::cast<IntegerAttr>(intAttr).getInt();
if (maskDimSize < resultSize)
return false;
}
@@ -6007,9 +5990,8 @@ public:
}
// Replace 'createMaskOp' with ConstantMaskOp.
- rewriter.replaceOpWithNewOp<ConstantMaskOp>(
- createMaskOp, retTy,
- vector::getVectorSubscriptAttr(rewriter, maskDimSizes));
+ rewriter.replaceOpWithNewOp<ConstantMaskOp>(createMaskOp, retTy,
+ maskDimSizes);
return success();
}
};
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
index dfeb7bc5..bfc05c7 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorMask.cpp
@@ -111,7 +111,7 @@ public:
if (rank == 0) {
assert(dimSizes.size() == 1 &&
"Expected exactly one dim size for a 0-D vector");
- bool value = cast<IntegerAttr>(dimSizes[0]).getInt() == 1;
+ bool value = dimSizes.front() == 1;
rewriter.replaceOpWithNewOp<arith::ConstantOp>(
op, dstType,
DenseIntElementsAttr::get(VectorType::get({}, rewriter.getI1Type()),
@@ -119,7 +119,7 @@ public:
return success();
}
- int64_t trueDimSize = cast<IntegerAttr>(dimSizes[0]).getInt();
+ int64_t trueDimSize = dimSizes.front();
if (rank == 1) {
if (trueDimSize == 0 || trueDimSize == dstType.getDimSize(0)) {
@@ -147,7 +147,7 @@ public:
VectorType lowType = VectorType::Builder(dstType).dropDim(0);
Value trueVal = rewriter.create<vector::ConstantMaskOp>(
- loc, lowType, rewriter.getArrayAttr(dimSizes.getValue().drop_front()));
+ loc, lowType, dimSizes.drop_front());
Value result = rewriter.create<arith::ConstantOp>(
loc, dstType, rewriter.getZeroAttr(dstType));
for (int64_t d = 0; d < trueDimSize; d++)
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
index 7ed3dea..42ac717 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp
@@ -550,9 +550,7 @@ struct CastAwayConstantMaskLeadingOneDim
return failure();
int64_t dropDim = oldType.getRank() - newType.getRank();
- SmallVector<int64_t> dimSizes;
- for (auto attr : mask.getMaskDimSizes())
- dimSizes.push_back(llvm::cast<IntegerAttr>(attr).getInt());
+ ArrayRef<int64_t> dimSizes = mask.getMaskDimSizes();
// If any of the dropped unit dims has a size of `0`, the entire mask is a
// zero mask, else the unit dim has no effect on the mask.
@@ -563,7 +561,7 @@ struct CastAwayConstantMaskLeadingOneDim
newDimSizes.append(dimSizes.begin() + dropDim + 1, dimSizes.end());
auto newMask = rewriter.create<vector::ConstantMaskOp>(
- mask.getLoc(), newType, rewriter.getI64ArrayAttr(newDimSizes));
+ mask.getLoc(), newType, newDimSizes);
rewriter.replaceOpWithNewOp<vector::BroadcastOp>(mask, oldType, newMask);
return success();
}
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index ac2a4d3..d3296ee 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -83,17 +83,14 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
newMask = rewriter.create<vector::CreateMaskOp>(loc, newMaskType,
newMaskOperands);
} else if (constantMaskOp) {
- ArrayRef<Attribute> maskDimSizes =
- constantMaskOp.getMaskDimSizes().getValue();
+ ArrayRef<int64_t> maskDimSizes = constantMaskOp.getMaskDimSizes();
size_t numMaskOperands = maskDimSizes.size();
- auto origIndex =
- cast<IntegerAttr>(maskDimSizes[numMaskOperands - 1]).getInt();
- IntegerAttr maskIndexAttr =
- rewriter.getI64IntegerAttr((origIndex + scale - 1) / scale);
- SmallVector<Attribute> newMaskDimSizes(maskDimSizes.drop_back());
- newMaskDimSizes.push_back(maskIndexAttr);
- newMask = rewriter.create<vector::ConstantMaskOp>(
- loc, newMaskType, rewriter.getArrayAttr(newMaskDimSizes));
+ int64_t origIndex = maskDimSizes[numMaskOperands - 1];
+ int64_t maskIndex = (origIndex + scale - 1) / scale;
+ SmallVector<int64_t> newMaskDimSizes(maskDimSizes.drop_back());
+ newMaskDimSizes.push_back(maskIndex);
+ newMask = rewriter.create<vector::ConstantMaskOp>(loc, newMaskType,
+ newMaskDimSizes);
}
while (!extractOps.empty()) {
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp
index 37216ce..ec2ef3f 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp
@@ -225,8 +225,7 @@ public:
off += stride)
offsets.push_back(off);
rewriter.replaceOpWithNewOp<ShuffleOp>(op, dstType, op.getVector(),
- op.getVector(),
- rewriter.getI64ArrayAttr(offsets));
+ op.getVector(), offsets);
return success();
}
};
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
index 4a3ae1b..868397f 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
@@ -232,8 +232,7 @@ struct LinearizeVectorExtractStridedSlice final
}
// Perform a shuffle to extract the kD vector.
rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
- extractOp, dstType, srcVector, srcVector,
- rewriter.getI64ArrayAttr(indices));
+ extractOp, dstType, srcVector, srcVector, indices);
return success();
}
@@ -298,20 +297,17 @@ struct LinearizeVectorShuffle final
// that needs to be shuffled to the destination vector. If shuffleSliceLen >
// 1 we need to shuffle the slices (consecutive shuffleSliceLen number of
// elements) instead of scalars.
- ArrayAttr mask = shuffleOp.getMask();
+ ArrayRef<int64_t> mask = shuffleOp.getMask();
int64_t totalSizeOfShuffledElmnts = mask.size() * shuffleSliceLen;
llvm::SmallVector<int64_t, 2> indices(totalSizeOfShuffledElmnts);
- for (auto [i, value] :
- llvm::enumerate(mask.getAsValueRange<IntegerAttr>())) {
-
- int64_t v = value.getZExtValue();
+ for (auto [i, value] : llvm::enumerate(mask)) {
std::iota(indices.begin() + shuffleSliceLen * i,
indices.begin() + shuffleSliceLen * (i + 1),
- shuffleSliceLen * v);
+ shuffleSliceLen * value);
}
- rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
- shuffleOp, dstType, vec1, vec2, rewriter.getI64ArrayAttr(indices));
+ rewriter.replaceOpWithNewOp<vector::ShuffleOp>(shuffleOp, dstType, vec1,
+ vec2, indices);
return success();
}
@@ -368,8 +364,7 @@ struct LinearizeVectorExtract final
llvm::SmallVector<int64_t, 2> indices(size);
std::iota(indices.begin(), indices.end(), linearizedOffset);
rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
- extractOp, dstTy, adaptor.getVector(), adaptor.getVector(),
- rewriter.getI64ArrayAttr(indices));
+ extractOp, dstTy, adaptor.getVector(), adaptor.getVector(), indices);
return success();
}
@@ -452,8 +447,7 @@ struct LinearizeVectorInsert final
// [offset+srcNumElements, end)
rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
- insertOp, dstTy, adaptor.getDest(), adaptor.getSource(),
- rewriter.getI64ArrayAttr(indices));
+ insertOp, dstTy, adaptor.getDest(), adaptor.getSource(), indices);
return success();
}
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 4ed5a8b..e590d8c 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -323,8 +323,7 @@ SmallVector<OpFoldResult> vector::getMixedSizesXfer(bool hasTensorSemantics,
}
bool vector::isLinearizableVector(VectorType type) {
- auto numScalableDims = llvm::count(type.getScalableDims(), true);
- return (type.getRank() > 1) && (numScalableDims <= 1);
+ return (type.getRank() > 1) && (type.getNumScalableDims() <= 1);
}
Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
diff --git a/mlir/lib/IR/ODSSupport.cpp b/mlir/lib/IR/ODSSupport.cpp
index 6e968d62..d56c75e 100644
--- a/mlir/lib/IR/ODSSupport.cpp
+++ b/mlir/lib/IR/ODSSupport.cpp
@@ -33,6 +33,50 @@ Attribute mlir::convertToAttribute(MLIRContext *ctx, int64_t storage) {
return IntegerAttr::get(IntegerType::get(ctx, 64), storage);
}
+LogicalResult
+mlir::convertFromAttribute(int32_t &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<IntegerAttr>(attr);
+ if (!valueAttr) {
+ emitError() << "expected IntegerAttr for key `value`";
+ return failure();
+ }
+ storage = valueAttr.getValue().getSExtValue();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, int32_t storage) {
+ return IntegerAttr::get(IntegerType::get(ctx, 32), storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(std::string &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<StringAttr>(attr);
+ if (!valueAttr)
+ return emitError()
+ << "expected string property to come from string attribute";
+ storage = valueAttr.getValue().str();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx,
+ const std::string &storage) {
+ return StringAttr::get(ctx, storage);
+}
+
+LogicalResult
+mlir::convertFromAttribute(bool &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ auto valueAttr = dyn_cast<BoolAttr>(attr);
+ if (!valueAttr)
+ return emitError()
+ << "expected string property to come from string attribute";
+ storage = valueAttr.getValue();
+ return success();
+}
+Attribute mlir::convertToAttribute(MLIRContext *ctx, bool storage) {
+ return BoolAttr::get(ctx, storage);
+}
+
template <typename DenseArrayTy, typename T>
LogicalResult
convertDenseArrayFromAttr(MutableArrayRef<T> storage, Attribute attr,
@@ -64,6 +108,33 @@ mlir::convertFromAttribute(MutableArrayRef<int32_t> storage, Attribute attr,
"DenseI32ArrayAttr");
}
+template <typename DenseArrayTy, typename T>
+LogicalResult
+convertDenseArrayFromAttr(SmallVectorImpl<T> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError,
+ StringRef denseArrayTyStr) {
+ auto valueAttr = dyn_cast<DenseArrayTy>(attr);
+ if (!valueAttr) {
+ emitError() << "expected " << denseArrayTyStr << " for key `value`";
+ return failure();
+ }
+ storage.resize_for_overwrite(valueAttr.size());
+ llvm::copy(valueAttr.asArrayRef(), storage.begin());
+ return success();
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int64_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ return convertDenseArrayFromAttr<DenseI64ArrayAttr>(storage, attr, emitError,
+ "DenseI64ArrayAttr");
+}
+LogicalResult
+mlir::convertFromAttribute(SmallVectorImpl<int32_t> &storage, Attribute attr,
+ function_ref<InFlightDiagnostic()> emitError) {
+ return convertDenseArrayFromAttr<DenseI32ArrayAttr>(storage, attr, emitError,
+ "DenseI32ArrayAttr");
+}
+
Attribute mlir::convertToAttribute(MLIRContext *ctx,
ArrayRef<int64_t> storage) {
return DenseI64ArrayAttr::get(ctx, storage);
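These new overloads extend the property/attribute bridge to int32_t, std::string, bool, and SmallVectorImpl storage. A hedged round-trip sketch, assuming the matching declarations land in mlir/include/mlir/IR/ODSSupport.h as part of this change (includes may need adjusting to the tree):

// Hedged round-trip through the new ODSSupport helpers.
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/ODSSupport.h"
#include "mlir/Support/LLVM.h"
#include <cassert>
#include <cstdint>

int main() {
  mlir::MLIRContext ctx;
  auto emitErr = [&] { return mlir::emitError(mlir::UnknownLoc::get(&ctx)); };

  // int32_t storage <-> IntegerAttr (new overloads added above).
  mlir::Attribute attr = mlir::convertToAttribute(&ctx, int32_t{42});
  int32_t value = 0;
  if (mlir::succeeded(mlir::convertFromAttribute(value, attr, emitErr)))
    assert(value == 42);

  // bool storage <-> BoolAttr.
  bool flag = false;
  if (mlir::succeeded(mlir::convertFromAttribute(
          flag, mlir::convertToAttribute(&ctx, true), emitErr)))
    assert(flag);
  return 0;
}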
diff --git a/mlir/lib/IR/Verifier.cpp b/mlir/lib/IR/Verifier.cpp
index a09b47e..5d81e9b 100644
--- a/mlir/lib/IR/Verifier.cpp
+++ b/mlir/lib/IR/Verifier.cpp
@@ -268,7 +268,7 @@ LogicalResult OperationVerifier::verifyOnExit(Operation &op) {
/// verifyBlockPostChildren.
LogicalResult OperationVerifier::verifyOperation(Operation &op) {
SmallVector<WorkItem> worklist{{&op}};
- DenseSet<WorkItem> seen;
+ SmallPtrSet<WorkItem, 8> seen;
while (!worklist.empty()) {
WorkItem top = worklist.back();
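The container swap above trades DenseSet for llvm::SmallPtrSet, which keeps the first few pointers in inline storage and only spills to the heap for larger sets; verifier worklists are usually small. A small usage sketch (illustrative only, unrelated to WorkItem):

// Illustrative use of llvm::SmallPtrSet as a "seen" set.
#include "llvm/ADT/SmallPtrSet.h"
#include <cassert>

int main() {
  int a = 0, b = 1;
  llvm::SmallPtrSet<int *, 8> seen; // first 8 pointers stored inline
  assert(seen.insert(&a).second);   // newly inserted
  assert(!seen.insert(&a).second);  // duplicate insert is a no-op
  assert(seen.insert(&b).second);
  assert(seen.contains(&a) && seen.contains(&b));
  return 0;
}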
diff --git a/mlir/lib/TableGen/Property.cpp b/mlir/lib/TableGen/Property.cpp
index e61d2fd..b86b87df 100644
--- a/mlir/lib/TableGen/Property.cpp
+++ b/mlir/lib/TableGen/Property.cpp
@@ -33,16 +33,23 @@ static StringRef getValueAsString(const Init *init) {
}
Property::Property(const Record *def)
- : Property(getValueAsString(def->getValueInit("storageType")),
- getValueAsString(def->getValueInit("interfaceType")),
- getValueAsString(def->getValueInit("convertFromStorage")),
- getValueAsString(def->getValueInit("assignToStorage")),
- getValueAsString(def->getValueInit("convertToAttribute")),
- getValueAsString(def->getValueInit("convertFromAttribute")),
- getValueAsString(def->getValueInit("readFromMlirBytecode")),
- getValueAsString(def->getValueInit("writeToMlirBytecode")),
- getValueAsString(def->getValueInit("hashProperty")),
- getValueAsString(def->getValueInit("defaultValue"))) {
+ : Property(
+ getValueAsString(def->getValueInit("summary")),
+ getValueAsString(def->getValueInit("description")),
+ getValueAsString(def->getValueInit("storageType")),
+ getValueAsString(def->getValueInit("interfaceType")),
+ getValueAsString(def->getValueInit("convertFromStorage")),
+ getValueAsString(def->getValueInit("assignToStorage")),
+ getValueAsString(def->getValueInit("convertToAttribute")),
+ getValueAsString(def->getValueInit("convertFromAttribute")),
+ getValueAsString(def->getValueInit("parser")),
+ getValueAsString(def->getValueInit("optionalParser")),
+ getValueAsString(def->getValueInit("printer")),
+ getValueAsString(def->getValueInit("readFromMlirBytecode")),
+ getValueAsString(def->getValueInit("writeToMlirBytecode")),
+ getValueAsString(def->getValueInit("hashProperty")),
+ getValueAsString(def->getValueInit("defaultValue")),
+ getValueAsString(def->getValueInit("storageTypeValueOverride"))) {
this->def = def;
assert((def->isSubClassOf("Property") || def->isSubClassOf("Attr")) &&
"must be subclass of TableGen 'Property' class");
@@ -50,22 +57,44 @@ Property::Property(const Record *def)
Property::Property(const DefInit *init) : Property(init->getDef()) {}
-Property::Property(StringRef storageType, StringRef interfaceType,
+Property::Property(StringRef summary, StringRef description,
+ StringRef storageType, StringRef interfaceType,
StringRef convertFromStorageCall,
StringRef assignToStorageCall,
StringRef convertToAttributeCall,
- StringRef convertFromAttributeCall,
+ StringRef convertFromAttributeCall, StringRef parserCall,
+ StringRef optionalParserCall, StringRef printerCall,
StringRef readFromMlirBytecodeCall,
StringRef writeToMlirBytecodeCall,
- StringRef hashPropertyCall, StringRef defaultValue)
- : storageType(storageType), interfaceType(interfaceType),
+ StringRef hashPropertyCall, StringRef defaultValue,
+ StringRef storageTypeValueOverride)
+ : summary(summary), description(description), storageType(storageType),
+ interfaceType(interfaceType),
convertFromStorageCall(convertFromStorageCall),
assignToStorageCall(assignToStorageCall),
convertToAttributeCall(convertToAttributeCall),
convertFromAttributeCall(convertFromAttributeCall),
+ parserCall(parserCall), optionalParserCall(optionalParserCall),
+ printerCall(printerCall),
readFromMlirBytecodeCall(readFromMlirBytecodeCall),
writeToMlirBytecodeCall(writeToMlirBytecodeCall),
- hashPropertyCall(hashPropertyCall), defaultValue(defaultValue) {
+ hashPropertyCall(hashPropertyCall), defaultValue(defaultValue),
+ storageTypeValueOverride(storageTypeValueOverride) {
if (storageType.empty())
storageType = "Property";
}
+
+StringRef Property::getPropertyDefName() const {
+ if (def->isAnonymous()) {
+ return getBaseProperty().def->getName();
+ }
+ return def->getName();
+}
+
+Property Property::getBaseProperty() const {
+ if (const auto *defInit =
+ llvm::dyn_cast<llvm::DefInit>(def->getValueInit("baseProperty"))) {
+ return Property(defInit).getBaseProperty();
+ }
+ return *this;
+}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8b031de..ddee117 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -362,9 +362,9 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
auto criticalDeclareOp =
SymbolTable::lookupNearestSymbolFrom<omp::CriticalDeclareOp>(criticalOp,
symbolRef);
- hint = llvm::ConstantInt::get(
- llvm::Type::getInt32Ty(llvmContext),
- static_cast<int>(criticalDeclareOp.getHintVal()));
+ hint =
+ llvm::ConstantInt::get(llvm::Type::getInt32Ty(llvmContext),
+ static_cast<int>(criticalDeclareOp.getHint()));
}
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createCritical(
ompLoc, bodyGenCB, finiCB, criticalOp.getName().value_or(""), hint));
@@ -376,7 +376,7 @@ template <typename T>
static void
collectReductionDecls(T loop,
SmallVectorImpl<omp::DeclareReductionOp> &reductions) {
- std::optional<ArrayAttr> attr = loop.getReductions();
+ std::optional<ArrayAttr> attr = loop.getReductionSyms();
if (!attr)
return;
@@ -534,11 +534,11 @@ convertOmpOrdered(Operation &opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
auto orderedOp = cast<omp::OrderedOp>(opInst);
- omp::ClauseDepend dependType = *orderedOp.getDependTypeVal();
+ omp::ClauseDepend dependType = *orderedOp.getDoacrossDependType();
bool isDependSource = dependType == omp::ClauseDepend::dependsource;
- unsigned numLoops = *orderedOp.getNumLoopsVal();
+ unsigned numLoops = *orderedOp.getDoacrossNumLoops();
SmallVector<llvm::Value *> vecValues =
- moduleTranslation.lookupValues(orderedOp.getDependVecVars());
+ moduleTranslation.lookupValues(orderedOp.getDoacrossDependVars());
size_t indexVecValues = 0;
while (indexVecValues < vecValues.size()) {
@@ -566,7 +566,7 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
auto orderedRegionOp = cast<omp::OrderedRegionOp>(opInst);
// TODO: The code generation for ordered simd directive is not supported yet.
- if (orderedRegionOp.getSimd())
+ if (orderedRegionOp.getParLevelSimd())
return failure();
// TODO: support error propagation in OpenMPIRBuilder and use it instead of
@@ -588,7 +588,7 @@ convertOmpOrderedRegion(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
builder.restoreIP(
moduleTranslation.getOpenMPBuilder()->createOrderedThreadsSimd(
- ompLoc, bodyGenCB, finiCB, !orderedRegionOp.getSimd()));
+ ompLoc, bodyGenCB, finiCB, !orderedRegionOp.getParLevelSimd()));
return bodyGenStatus;
}
@@ -837,11 +837,11 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
// TODO: Support the following clauses: private, firstprivate, lastprivate,
// allocate
if (!sectionsOp.getAllocateVars().empty() ||
- !sectionsOp.getAllocatorsVars().empty())
- return emitError(sectionsOp.getLoc())
- << "allocate clause is not supported for sections construct";
+ !sectionsOp.getAllocatorVars().empty() ||
+ !sectionsOp.getPrivateVars().empty() || sectionsOp.getPrivateSyms())
+ return opInst.emitError("unhandled clauses for translation to LLVM IR");
- llvm::ArrayRef<bool> isByRef = getIsByRef(sectionsOp.getReductionVarsByref());
+ llvm::ArrayRef<bool> isByRef = getIsByRef(sectionsOp.getReductionByref());
assert(isByRef.size() == sectionsOp.getNumReductionVars());
SmallVector<omp::DeclareReductionOp> reductionDecls;
@@ -945,6 +945,9 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
LogicalResult bodyGenStatus = success();
+ if (!singleOp.getPrivateVars().empty() || singleOp.getPrivateSyms())
+ return singleOp.emitError("unhandled clauses for translation to LLVM IR");
+
auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
builder.restoreIP(codegenIP);
convertOmpOpRegions(singleOp.getRegion(), "omp.single.region", builder,
@@ -954,7 +957,7 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
// Handle copyprivate
Operation::operand_range cpVars = singleOp.getCopyprivateVars();
- std::optional<ArrayAttr> cpFuncs = singleOp.getCopyprivateFuncs();
+ std::optional<ArrayAttr> cpFuncs = singleOp.getCopyprivateSyms();
llvm::SmallVector<llvm::Value *> llvmCPVars;
llvm::SmallVector<llvm::Function *> llvmCPFuncs;
for (size_t i = 0, e = cpVars.size(); i < e; ++i) {
@@ -976,7 +979,8 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
LogicalResult bodyGenStatus = success();
- if (!op.getAllocatorsVars().empty() || op.getReductions())
+ if (!op.getAllocatorVars().empty() || op.getReductionSyms() ||
+ !op.getPrivateVars().empty() || op.getPrivateSyms())
return op.emitError("unhandled clauses for translation to LLVM IR");
auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
@@ -1000,8 +1004,8 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
threadLimit = moduleTranslation.lookupValue(threadLimitVar);
llvm::Value *ifExpr = nullptr;
- if (Value ifExprVar = op.getIfExpr())
- ifExpr = moduleTranslation.lookupValue(ifExprVar);
+ if (Value ifVar = op.getIfExpr())
+ ifExpr = moduleTranslation.lookupValue(ifVar);
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTeams(
@@ -1010,12 +1014,12 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
}
static void
-buildDependData(std::optional<ArrayAttr> depends, OperandRange dependVars,
+buildDependData(std::optional<ArrayAttr> dependKinds, OperandRange dependVars,
LLVM::ModuleTranslation &moduleTranslation,
SmallVectorImpl<llvm::OpenMPIRBuilder::DependData> &dds) {
if (dependVars.empty())
return;
- for (auto dep : llvm::zip(dependVars, depends->getValue())) {
+ for (auto dep : llvm::zip(dependVars, dependKinds->getValue())) {
llvm::omp::RTLDependenceKindTy type;
switch (
cast<mlir::omp::ClauseTaskDependAttr>(std::get<1>(dep)).getValue()) {
@@ -1042,8 +1046,9 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
LogicalResult bodyGenStatus = success();
if (taskOp.getUntiedAttr() || taskOp.getMergeableAttr() ||
- taskOp.getInReductions() || taskOp.getPriority() ||
- !taskOp.getAllocateVars().empty()) {
+ taskOp.getInReductionSyms() || taskOp.getPriority() ||
+ !taskOp.getAllocateVars().empty() || !taskOp.getPrivateVars().empty() ||
+ taskOp.getPrivateSyms()) {
return taskOp.emitError("unhandled clauses for translation to LLVM IR");
}
auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
@@ -1058,7 +1063,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
};
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
- buildDependData(taskOp.getDepends(), taskOp.getDependVars(),
+ buildDependData(taskOp.getDependKinds(), taskOp.getDependVars(),
moduleTranslation, dds);
llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
@@ -1066,7 +1071,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTask(
ompLoc, allocaIP, bodyCB, !taskOp.getUntied(),
- moduleTranslation.lookupValue(taskOp.getFinalExpr()),
+ moduleTranslation.lookupValue(taskOp.getFinal()),
moduleTranslation.lookupValue(taskOp.getIfExpr()), dds));
return bodyGenStatus;
}
@@ -1091,30 +1096,47 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
ompLoc, allocaIP, bodyCB));
return bodyGenStatus;
}
+
+static LogicalResult
+convertOmpTaskwaitOp(omp::TaskwaitOp twOp, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ if (!twOp.getDependVars().empty() || twOp.getDependKinds() ||
+ twOp.getNowait())
+ return twOp.emitError("unhandled clauses for translation to LLVM IR");
+
+ moduleTranslation.getOpenMPBuilder()->createTaskwait(builder.saveIP());
+ return success();
+}
+
/// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
auto wsloopOp = cast<omp::WsloopOp>(opInst);
+ if (!wsloopOp.getAllocateVars().empty() ||
+ !wsloopOp.getAllocatorVars().empty() ||
+ !wsloopOp.getPrivateVars().empty() || wsloopOp.getPrivateSyms())
+ return opInst.emitError("unhandled clauses for translation to LLVM IR");
+
// FIXME: Here any other nested wrappers (e.g. omp.simd) are skipped, so
// codegen for composite constructs like 'DO/FOR SIMD' will be the same as for
// 'DO/FOR'.
auto loopOp = cast<omp::LoopNestOp>(wsloopOp.getWrappedLoop());
- llvm::ArrayRef<bool> isByRef = getIsByRef(wsloopOp.getReductionVarsByref());
+ llvm::ArrayRef<bool> isByRef = getIsByRef(wsloopOp.getReductionByref());
assert(isByRef.size() == wsloopOp.getNumReductionVars());
// Static is the default.
auto schedule =
- wsloopOp.getScheduleVal().value_or(omp::ClauseScheduleKind::Static);
+ wsloopOp.getScheduleKind().value_or(omp::ClauseScheduleKind::Static);
// Find the loop configuration.
- llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[0]);
+ llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]);
llvm::Type *ivType = step->getType();
llvm::Value *chunk = nullptr;
- if (wsloopOp.getScheduleChunkVar()) {
+ if (wsloopOp.getScheduleChunk()) {
llvm::Value *chunkVar =
- moduleTranslation.lookupValue(wsloopOp.getScheduleChunkVar());
+ moduleTranslation.lookupValue(wsloopOp.getScheduleChunk());
chunk = builder.CreateSExtOrTrunc(chunkVar, ivType);
}
@@ -1178,10 +1200,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) {
llvm::Value *lowerBound =
- moduleTranslation.lookupValue(loopOp.getLowerBound()[i]);
+ moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[i]);
llvm::Value *upperBound =
- moduleTranslation.lookupValue(loopOp.getUpperBound()[i]);
- llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]);
+ moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[i]);
+ llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[i]);
// Make sure loop trip counts are emitted in the preheader of the outermost
// loop at the latest so that they are all available for the new collapsed
@@ -1194,7 +1216,7 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
}
loopInfos.push_back(ompBuilder->createCanonicalLoop(
loc, bodyGen, lowerBound, upperBound, step,
- /*IsSigned=*/true, loopOp.getInclusive(), computeIP));
+ /*IsSigned=*/true, loopOp.getLoopInclusive(), computeIP));
if (failed(bodyGenStatus))
return failure();
@@ -1209,16 +1231,15 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
// TODO: Handle doacross loops when the ordered clause has a parameter.
- bool isOrdered = wsloopOp.getOrderedVal().has_value();
- std::optional<omp::ScheduleModifier> scheduleModifier =
- wsloopOp.getScheduleModifier();
- bool isSimd = wsloopOp.getSimdModifier();
+ bool isOrdered = wsloopOp.getOrdered().has_value();
+ std::optional<omp::ScheduleModifier> scheduleMod = wsloopOp.getScheduleMod();
+ bool isSimd = wsloopOp.getScheduleSimd();
ompBuilder->applyWorkshareLoop(
ompLoc.DL, loopInfo, allocaIP, !wsloopOp.getNowait(),
convertToScheduleKind(schedule), chunk, isSimd,
- scheduleModifier == omp::ScheduleModifier::monotonic,
- scheduleModifier == omp::ScheduleModifier::nonmonotonic, isOrdered);
+ scheduleMod == omp::ScheduleModifier::monotonic,
+ scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered);
// Continue building IR after the loop. Note that the LoopInfo returned by
// `collapseLoops` points inside the outermost loop and is intended for
@@ -1275,7 +1296,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
OmpParallelOpConversionManager raii(opInst);
- ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionVarsByref());
+ ArrayRef<bool> isByRef = getIsByRef(opInst.getReductionByref());
assert(isByRef.size() == opInst.getNumReductionVars());
// TODO: support error propagation in OpenMPIRBuilder and use it instead of
@@ -1420,11 +1441,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
auto [privVar, privatizerClone] =
[&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
if (!opInst.getPrivateVars().empty()) {
- auto privVars = opInst.getPrivateVars();
- auto privatizers = opInst.getPrivatizers();
+ auto privateVars = opInst.getPrivateVars();
+ auto privateSyms = opInst.getPrivateSyms();
for (auto [privVar, privatizerAttr] :
- llvm::zip_equal(privVars, *privatizers)) {
+ llvm::zip_equal(privateVars, *privateSyms)) {
// Find the MLIR private variable corresponding to the LLVM value
// being privatized.
llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar);
@@ -1564,13 +1585,13 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
};
llvm::Value *ifCond = nullptr;
- if (auto ifExprVar = opInst.getIfExpr())
- ifCond = moduleTranslation.lookupValue(ifExprVar);
+ if (auto ifVar = opInst.getIfExpr())
+ ifCond = moduleTranslation.lookupValue(ifVar);
llvm::Value *numThreads = nullptr;
- if (auto numThreadsVar = opInst.getNumThreadsVar())
+ if (auto numThreadsVar = opInst.getNumThreads())
numThreads = moduleTranslation.lookupValue(numThreadsVar);
auto pbKind = llvm::omp::OMP_PROC_BIND_default;
- if (auto bind = opInst.getProcBindVal())
+ if (auto bind = opInst.getProcBindKind())
pbKind = getProcBindKind(*bind);
// TODO: Is the Parallel construct cancellable?
bool isCancellable = false;
@@ -1608,6 +1629,12 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
auto simdOp = cast<omp::SimdOp>(opInst);
auto loopOp = cast<omp::LoopNestOp>(simdOp.getWrappedLoop());
+ if (!simdOp.getLinearVars().empty() || !simdOp.getLinearStepVars().empty() ||
+ !simdOp.getPrivateVars().empty() || simdOp.getPrivateSyms() ||
+ !simdOp.getReductionVars().empty() || simdOp.getReductionByref() ||
+ simdOp.getReductionSyms())
+ return opInst.emitError("unhandled clauses for translation to LLVM IR");
+
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
// Generator of the canonical loop body.
@@ -1643,10 +1670,10 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) {
llvm::Value *lowerBound =
- moduleTranslation.lookupValue(loopOp.getLowerBound()[i]);
+ moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[i]);
llvm::Value *upperBound =
- moduleTranslation.lookupValue(loopOp.getUpperBound()[i]);
- llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]);
+ moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[i]);
+ llvm::Value *step = moduleTranslation.lookupValue(loopOp.getLoopSteps()[i]);
// Make sure loop trip counts are emitted in the preheader of the outermost
// loop at the latest so that they are all available for the new collapsed
@@ -1680,7 +1707,7 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
safelen = builder.getInt64(safelenVar.value());
llvm::MapVector<llvm::Value *, llvm::Value *> alignedVars;
- llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrderVal());
+ llvm::omp::OrderKind order = convertOrderKind(simdOp.getOrder());
ompBuilder->applySimd(loopInfo, alignedVars,
simdOp.getIfExpr()
? moduleTranslation.lookupValue(simdOp.getIfExpr())
@@ -1722,7 +1749,7 @@ convertOmpAtomicRead(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
- llvm::AtomicOrdering AO = convertAtomicOrdering(readOp.getMemoryOrderVal());
+ llvm::AtomicOrdering AO = convertAtomicOrdering(readOp.getMemoryOrder());
llvm::Value *x = moduleTranslation.lookupValue(readOp.getX());
llvm::Value *v = moduleTranslation.lookupValue(readOp.getV());
@@ -1743,7 +1770,7 @@ convertOmpAtomicWrite(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
- llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrderVal());
+ llvm::AtomicOrdering ao = convertAtomicOrdering(writeOp.getMemoryOrder());
llvm::Value *expr = moduleTranslation.lookupValue(writeOp.getExpr());
llvm::Value *dest = moduleTranslation.lookupValue(writeOp.getX());
llvm::Type *ty = moduleTranslation.convertType(writeOp.getExpr().getType());
@@ -1811,7 +1838,7 @@ convertOmpAtomicUpdate(omp::AtomicUpdateOp &opInst,
/*isVolatile=*/false};
llvm::AtomicOrdering atomicOrdering =
- convertAtomicOrdering(opInst.getMemoryOrderVal());
+ convertAtomicOrdering(opInst.getMemoryOrder());
// Generate update code.
LogicalResult updateGenStatus = success();
@@ -1903,7 +1930,7 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp,
/*isVolatile=*/false};
llvm::AtomicOrdering atomicOrdering =
- convertAtomicOrdering(atomicCaptureOp.getMemoryOrderVal());
+ convertAtomicOrdering(atomicCaptureOp.getMemoryOrder());
LogicalResult updateGenStatus = success();
auto updateFn = [&](llvm::Value *atomicx,
@@ -2166,12 +2193,11 @@ llvm::Value *getSizeInBytes(DataLayout &dl, const mlir::Type &type,
return builder.getInt64(dl.getTypeSizeInBits(type) / 8);
}
-void collectMapDataFromMapOperands(MapInfoData &mapData,
- llvm::SmallVectorImpl<Value> &mapOperands,
- LLVM::ModuleTranslation &moduleTranslation,
- DataLayout &dl,
- llvm::IRBuilderBase &builder) {
- for (mlir::Value mapValue : mapOperands) {
+void collectMapDataFromMapVars(MapInfoData &mapData,
+ llvm::SmallVectorImpl<Value> &mapVars,
+ LLVM::ModuleTranslation &moduleTranslation,
+ DataLayout &dl, llvm::IRBuilderBase &builder) {
+ for (mlir::Value mapValue : mapVars) {
if (auto mapOp = mlir::dyn_cast_if_present<mlir::omp::MapInfoOp>(
mapValue.getDefiningOp())) {
mlir::Value offloadPtr =
@@ -2211,7 +2237,7 @@ void collectMapDataFromMapOperands(MapInfoData &mapData,
// TODO: May require some further additions to support nested record
// types, i.e. member maps that can have member maps.
mapData.IsAMember.push_back(false);
- for (mlir::Value mapValue : mapOperands) {
+ for (mlir::Value mapValue : mapVars) {
if (auto map = mlir::dyn_cast_if_present<mlir::omp::MapInfoOp>(
mapValue.getDefiningOp())) {
for (auto member : map.getMembers()) {
@@ -2689,8 +2715,8 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
DataLayout &dl,
llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo,
MapInfoData &mapData,
- const SmallVector<Value> &devPtrOperands = {},
- const SmallVector<Value> &devAddrOperands = {},
+ const SmallVector<Value> &useDevicePtrVars = {},
+ const SmallVector<Value> &useDeviceAddrVars = {},
bool isTargetParams = false) {
// We wish to modify some of the methods in which arguments are
// passed based on their capture type by the target region, this can
@@ -2748,13 +2774,13 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
return false;
};
- auto addDevInfos = [&, fail](auto devOperands, auto devOpType) -> void {
- for (const auto &devOp : devOperands) {
+ auto addDevInfos = [&, fail](auto useDeviceVars, auto devOpType) -> void {
+ for (const auto &useDeviceVar : useDeviceVars) {
// TODO: Only LLVMPointerTypes are handled.
- if (!isa<LLVM::LLVMPointerType>(devOp.getType()))
+ if (!isa<LLVM::LLVMPointerType>(useDeviceVar.getType()))
return fail();
- llvm::Value *mapOpValue = moduleTranslation.lookupValue(devOp);
+ llvm::Value *mapOpValue = moduleTranslation.lookupValue(useDeviceVar);
// Check if map info is already present for this entry.
unsigned infoIndex;
@@ -2767,7 +2793,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
combinedInfo.Pointers.emplace_back(mapOpValue);
combinedInfo.DevicePointers.emplace_back(devOpType);
combinedInfo.Names.emplace_back(
- LLVM::createMappingInformation(devOp.getLoc(), *ompBuilder));
+ LLVM::createMappingInformation(useDeviceVar.getLoc(), *ompBuilder));
combinedInfo.Types.emplace_back(
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_RETURN_PARAM);
combinedInfo.Sizes.emplace_back(builder.getInt64(0));
@@ -2775,8 +2801,8 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
}
};
- addDevInfos(devPtrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer);
- addDevInfos(devAddrOperands, llvm::OpenMPIRBuilder::DeviceInfoTy::Address);
+ addDevInfos(useDevicePtrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Pointer);
+ addDevInfos(useDeviceAddrVars, llvm::OpenMPIRBuilder::DeviceInfoTy::Address);
}
static LogicalResult
@@ -2784,9 +2810,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
llvm::Value *ifCond = nullptr;
int64_t deviceID = llvm::omp::OMP_DEVICEID_UNDEF;
- SmallVector<Value> mapOperands;
- SmallVector<Value> useDevPtrOperands;
- SmallVector<Value> useDevAddrOperands;
+ SmallVector<Value> mapVars;
+ SmallVector<Value> useDevicePtrVars;
+ SmallVector<Value> useDeviceAddrVars;
llvm::omp::RuntimeFunction RTLFn;
DataLayout DL = DataLayout(op->getParentOfType<ModuleOp>());
@@ -2795,8 +2821,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
LogicalResult result =
llvm::TypeSwitch<Operation *, LogicalResult>(op)
.Case([&](omp::TargetDataOp dataOp) {
- if (auto ifExprVar = dataOp.getIfExpr())
- ifCond = moduleTranslation.lookupValue(ifExprVar);
+ if (auto ifVar = dataOp.getIfExpr())
+ ifCond = moduleTranslation.lookupValue(ifVar);
if (auto devId = dataOp.getDevice())
if (auto constOp =
@@ -2804,9 +2830,9 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
if (auto intAttr = dyn_cast<IntegerAttr>(constOp.getValue()))
deviceID = intAttr.getInt();
- mapOperands = dataOp.getMapOperands();
- useDevPtrOperands = dataOp.getUseDevicePtr();
- useDevAddrOperands = dataOp.getUseDeviceAddr();
+ mapVars = dataOp.getMapVars();
+ useDevicePtrVars = dataOp.getUseDevicePtrVars();
+ useDeviceAddrVars = dataOp.getUseDeviceAddrVars();
return success();
})
.Case([&](omp::TargetEnterDataOp enterDataOp) {
@@ -2814,8 +2840,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
return (LogicalResult)(enterDataOp.emitError(
"`nowait` is not supported yet"));
- if (auto ifExprVar = enterDataOp.getIfExpr())
- ifCond = moduleTranslation.lookupValue(ifExprVar);
+ if (auto ifVar = enterDataOp.getIfExpr())
+ ifCond = moduleTranslation.lookupValue(ifVar);
if (auto devId = enterDataOp.getDevice())
if (auto constOp =
@@ -2823,7 +2849,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
if (auto intAttr = dyn_cast<IntegerAttr>(constOp.getValue()))
deviceID = intAttr.getInt();
RTLFn = llvm::omp::OMPRTL___tgt_target_data_begin_mapper;
- mapOperands = enterDataOp.getMapOperands();
+ mapVars = enterDataOp.getMapVars();
return success();
})
.Case([&](omp::TargetExitDataOp exitDataOp) {
@@ -2831,8 +2857,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
return (LogicalResult)(exitDataOp.emitError(
"`nowait` is not supported yet"));
- if (auto ifExprVar = exitDataOp.getIfExpr())
- ifCond = moduleTranslation.lookupValue(ifExprVar);
+ if (auto ifVar = exitDataOp.getIfExpr())
+ ifCond = moduleTranslation.lookupValue(ifVar);
if (auto devId = exitDataOp.getDevice())
if (auto constOp =
@@ -2841,7 +2867,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
deviceID = intAttr.getInt();
RTLFn = llvm::omp::OMPRTL___tgt_target_data_end_mapper;
- mapOperands = exitDataOp.getMapOperands();
+ mapVars = exitDataOp.getMapVars();
return success();
})
.Case([&](omp::TargetUpdateOp updateDataOp) {
@@ -2849,8 +2875,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
return (LogicalResult)(updateDataOp.emitError(
"`nowait` is not supported yet"));
- if (auto ifExprVar = updateDataOp.getIfExpr())
- ifCond = moduleTranslation.lookupValue(ifExprVar);
+ if (auto ifVar = updateDataOp.getIfExpr())
+ ifCond = moduleTranslation.lookupValue(ifVar);
if (auto devId = updateDataOp.getDevice())
if (auto constOp =
@@ -2859,7 +2885,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
deviceID = intAttr.getInt();
RTLFn = llvm::omp::OMPRTL___tgt_target_data_update_mapper;
- mapOperands = updateDataOp.getMapOperands();
+ mapVars = updateDataOp.getMapVars();
return success();
})
.Default([&](Operation *op) {
@@ -2873,8 +2899,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
MapInfoData mapData;
- collectMapDataFromMapOperands(mapData, mapOperands, moduleTranslation, DL,
- builder);
+ collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, DL, builder);
// Fill up the arrays with all the mapped variables.
llvm::OpenMPIRBuilder::MapInfosTy combinedInfo;
@@ -2883,7 +2908,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
builder.restoreIP(codeGenIP);
if (auto dataOp = dyn_cast<omp::TargetDataOp>(op)) {
genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData,
- useDevPtrOperands, useDevAddrOperands);
+ useDevicePtrVars, useDeviceAddrVars);
} else {
genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData);
}
@@ -2905,7 +2930,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
if (!info.DevicePtrInfoMap.empty()) {
builder.restoreIP(codeGenIP);
unsigned argIndex = 0;
- for (auto &devPtrOp : useDevPtrOperands) {
+ for (auto &devPtrOp : useDevicePtrVars) {
llvm::Value *mapOpValue = moduleTranslation.lookupValue(devPtrOp);
const auto &arg = region.front().getArgument(argIndex);
moduleTranslation.mapValue(arg,
@@ -2913,7 +2938,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder,
argIndex++;
}
- for (auto &devAddrOp : useDevAddrOperands) {
+ for (auto &devAddrOp : useDeviceAddrVars) {
llvm::Value *mapOpValue = moduleTranslation.lookupValue(devAddrOp);
const auto &arg = region.front().getArgument(argIndex);
auto *LI = builder.CreateLoad(
@@ -3038,6 +3063,18 @@ static bool targetOpSupported(Operation &opInst) {
return false;
}
+ if (!targetOp.getAllocateVars().empty() ||
+ !targetOp.getAllocatorVars().empty()) {
+ opInst.emitError("Allocate clause not yet supported");
+ return false;
+ }
+
+ if (!targetOp.getInReductionVars().empty() ||
+ targetOp.getInReductionByref() || targetOp.getInReductionSyms()) {
+ opInst.emitError("In reduction clause not yet supported");
+ return false;
+ }
+
return true;
}
@@ -3200,7 +3237,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
auto targetOp = cast<omp::TargetOp>(opInst);
auto &targetRegion = targetOp.getRegion();
DataLayout dl = DataLayout(opInst.getParentOfType<ModuleOp>());
- SmallVector<Value> mapOperands = targetOp.getMapOperands();
+ SmallVector<Value> mapVars = targetOp.getMapVars();
llvm::Function *llvmOutlinedFn = nullptr;
LogicalResult bodyGenStatus = success();
@@ -3225,7 +3262,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
builder.restoreIP(codeGenIP);
unsigned argIndex = 0;
- for (auto &mapOp : mapOperands) {
+ for (auto &mapOp : mapVars) {
auto mapInfoOp =
mlir::dyn_cast<mlir::omp::MapInfoOp>(mapOp.getDefiningOp());
llvm::Value *mapOpValue =
@@ -3255,8 +3292,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
findAllocaInsertPoint(builder, moduleTranslation);
MapInfoData mapData;
- collectMapDataFromMapOperands(mapData, mapOperands, moduleTranslation, dl,
- builder);
+ collectMapDataFromMapVars(mapData, mapVars, moduleTranslation, dl, builder);
llvm::OpenMPIRBuilder::MapInfosTy combinedInfos;
auto genMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP)
@@ -3288,7 +3324,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
};
llvm::SmallVector<llvm::Value *, 4> kernelInput;
- for (size_t i = 0; i < mapOperands.size(); ++i) {
+ for (size_t i = 0; i < mapVars.size(); ++i) {
// declare target arguments are not passed to kernels as arguments
// TODO: We currently do not handle cases where a member is explicitly
// passed in as an argument, this will likely need to be handled in
@@ -3299,7 +3335,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
kernelInput.push_back(mapData.OriginalValue[i]);
}
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
- buildDependData(targetOp.getDepends(), targetOp.getDependVars(),
+ buildDependData(targetOp.getDependKinds(), targetOp.getDependVars(),
moduleTranslation, dds);
builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
@@ -3438,10 +3474,6 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
return success();
})
- .Case([&](omp::TaskwaitOp) {
- ompBuilder->createTaskwait(builder.saveIP());
- return success();
- })
.Case([&](omp::TaskyieldOp) {
ompBuilder->createTaskyield(builder.saveIP());
return success();
@@ -3509,6 +3541,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
.Case([&](omp::TaskgroupOp op) {
return convertOmpTaskgroupOp(op, builder, moduleTranslation);
})
+ .Case([&](omp::TaskwaitOp op) {
+ return convertOmpTaskwaitOp(op, builder, moduleTranslation);
+ })
.Case<omp::YieldOp, omp::TerminatorOp, omp::DeclareReductionOp,
omp::CriticalDeclareOp>([](auto op) {
// `yield` and `terminator` can be just omitted. The block structure
diff --git a/mlir/lib/Transforms/OpStats.cpp b/mlir/lib/Transforms/OpStats.cpp
index 6a71e1f..6746ed5 100644
--- a/mlir/lib/Transforms/OpStats.cpp
+++ b/mlir/lib/Transforms/OpStats.cpp
@@ -55,6 +55,7 @@ void PrintOpStatsPass::runOnOperation() {
printSummaryInJSON();
} else
printSummary();
+ markAllAnalysesPreserved();
}
void PrintOpStatsPass::printSummary() {
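markAllAnalysesPreserved() tells the pass manager that this read-only pass invalidates nothing, so cached analyses survive it. A hedged sketch of the same pattern in a throwaway pass (names and setup are illustrative, not part of this patch):

// Hedged sketch: a read-only pass that preserves all analyses.
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OwningOpRef.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include <memory>

namespace {
struct PrintOnlyPass
    : mlir::PassWrapper<PrintOnlyPass, mlir::OperationPass<mlir::ModuleOp>> {
  void runOnOperation() override {
    getOperation()->dump();     // observe the IR without mutating it
    markAllAnalysesPreserved(); // cached analyses stay valid afterwards
  }
};
} // namespace

int main() {
  mlir::MLIRContext ctx;
  mlir::OwningOpRef<mlir::ModuleOp> module =
      mlir::ModuleOp::create(mlir::UnknownLoc::get(&ctx));
  mlir::PassManager pm(&ctx);
  pm.addPass(std::make_unique<PrintOnlyPass>());
  return mlir::failed(pm.run(*module)) ? 1 : 0;
}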
diff --git a/mlir/lib/Transforms/PrintIR.cpp b/mlir/lib/Transforms/PrintIR.cpp
index cc42c7e..3c55f92 100644
--- a/mlir/lib/Transforms/PrintIR.cpp
+++ b/mlir/lib/Transforms/PrintIR.cpp
@@ -25,6 +25,7 @@ struct PrintIRPass : public impl::PrintIRPassBase<PrintIRPass> {
llvm::dbgs() << " " << this->label;
llvm::dbgs() << " //----- //\n";
getOperation()->dump();
+ markAllAnalysesPreserved();
}
};
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 059288e..fdd0175 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1316,24 +1316,34 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
continue;
}
- // This is a 1->1+ mapping. 1->N mappings are not fully supported in the
- // dialect conversion. Therefore, we need an argument materialization to
- // turn the replacement block arguments into a single SSA value that can be
- // used as a replacement.
+ // This is a 1->1+ mapping.
auto replArgs =
newBlock->getArguments().slice(inputMap->inputNo, inputMap->size);
+
+ // When there is no type converter, assume that the new block argument
+ // types are legal. This is reasonable to assume because they were
+ // specified by the user.
+ // FIXME: This won't work for 1->N conversions because multiple output
+ // types are not supported in parts of the dialect conversion. In such a
+ // case, we currently use the original block argument type (produced by
+ // the argument materialization).
+ if (!converter && replArgs.size() == 1) {
+ mapping.map(origArg, replArgs[0]);
+ appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
+ continue;
+ }
+
+ // 1->N mappings are not fully supported in the dialect conversion.
+ // Therefore, we need an argument materialization to turn the replacement
+ // block arguments into a single SSA value (of the original type) that can
+ // be used as a replacement.
Value argMat = buildUnresolvedMaterialization(
MaterializationKind::Argument, newBlock, newBlock->begin(),
origArg.getLoc(), /*inputs=*/replArgs, origArgType, converter);
mapping.map(origArg, argMat);
appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
- // FIXME: We simply pass through the replacement argument if there wasn't a
- // converter, which isn't great as it allows implicit type conversions to
- // appear. We should properly restructure this code to handle cases where a
- // converter isn't provided and also to properly handle the case where an
- // argument materialization is actually a temporary source materialization
- // (e.g. in the case of 1->N).
+ // Now legalize the type by building a target materialization.
Type legalOutputType;
if (converter)
legalOutputType = converter->convertType(origArgType);
@@ -1341,8 +1351,8 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion(
Value targetMat = buildUnresolvedTargetMaterialization(
origArg.getLoc(), argMat, legalOutputType, converter);
mapping.map(argMat, targetMat);
+ appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
}
- appendRewrite<ReplaceBlockArgRewrite>(block, origArg);
}
appendRewrite<BlockTypeConversionRewrite>(newBlock, block, converter);
diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp
index b3c0a06..82e9863 100644
--- a/mlir/lib/Transforms/ViewOpGraph.cpp
+++ b/mlir/lib/Transforms/ViewOpGraph.cpp
@@ -93,6 +93,7 @@ public:
processOperation(getOperation());
emitAllEdgeStmts();
});
+ markAllAnalysesPreserved();
}
/// Create a CFG graph for a region. Used in `Region::viewGraph`.
diff --git a/mlir/python/mlir/runtime/np_to_memref.py b/mlir/python/mlir/runtime/np_to_memref.py
index f6b706f..882b275 100644
--- a/mlir/python/mlir/runtime/np_to_memref.py
+++ b/mlir/python/mlir/runtime/np_to_memref.py
@@ -7,6 +7,12 @@
import numpy as np
import ctypes
+try:
+ import ml_dtypes
+except ModuleNotFoundError:
+ # The third-party ml_dtypes package provides optional low-precision data types for NumPy.
+ ml_dtypes = None
+
class C128(ctypes.Structure):
"""A ctype representation for MLIR's Double Complex."""
@@ -26,6 +32,12 @@ class F16(ctypes.Structure):
_fields_ = [("f16", ctypes.c_int16)]
+class BF16(ctypes.Structure):
+ """A ctype representation for MLIR's BFloat16."""
+
+ _fields_ = [("bf16", ctypes.c_int16)]
+
+
# https://stackoverflow.com/questions/26921836/correct-way-to-test-for-numpy-dtype
def as_ctype(dtp):
"""Converts dtype to ctype."""
@@ -35,6 +47,8 @@ def as_ctype(dtp):
return C64
if dtp == np.dtype(np.float16):
return F16
+ if ml_dtypes is not None and dtp == ml_dtypes.bfloat16:
+ return BF16
return np.ctypeslib.as_ctypes_type(dtp)
@@ -46,6 +60,11 @@ def to_numpy(array):
return array.view("complex64")
if array.dtype == F16:
return array.view("float16")
+ assert not (
+ array.dtype == BF16 and ml_dtypes is None
+ ), f"bfloat16 requires the ml_dtypes package, please run:\n\npip install ml_dtypes\n"
+ if array.dtype == BF16:
+ return array.view("bfloat16")
return array
diff --git a/mlir/python/requirements.txt b/mlir/python/requirements.txt
index acd6dbb..6ec63e4 100644
--- a/mlir/python/requirements.txt
+++ b/mlir/python/requirements.txt
@@ -1,3 +1,4 @@
numpy>=1.19.5, <=1.26
pybind11>=2.9.0, <=2.10.3
-PyYAML>=5.3.1, <=6.0.1 \ No newline at end of file
+PyYAML>=5.3.1, <=6.0.1
+ml_dtypes # provides several NumPy dtype extensions, including the bf16 \ No newline at end of file
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index bb1ceda..717667c 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -226,3 +226,34 @@ func.func @lds_barrier() {
amdgpu.lds_barrier
func.return
}
+
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+ // CHECK: rocdl.sched.barrier 0
+ amdgpu.sched_barrier allow = <none>
+ // CHECK: rocdl.sched.barrier 1
+ amdgpu.sched_barrier allow = <non_mem_non_sideffect>
+ // CHECK: rocdl.sched.barrier 2
+ amdgpu.sched_barrier allow = <valu>
+ // CHECK: rocdl.sched.barrier 4
+ amdgpu.sched_barrier allow = <salu>
+ // CHECK: rocdl.sched.barrier 8
+ amdgpu.sched_barrier allow = <mfma_wmma>
+ // CHECK: rocdl.sched.barrier 16
+ amdgpu.sched_barrier allow = <all_vmem>
+ // CHECK: rocdl.sched.barrier 32
+ amdgpu.sched_barrier allow = <vmem_read>
+ // CHECK: rocdl.sched.barrier 64
+ amdgpu.sched_barrier allow = <vmem_write>
+ // CHECK: rocdl.sched.barrier 128
+ amdgpu.sched_barrier allow = <all_ds>
+ // CHECK: rocdl.sched.barrier 256
+ amdgpu.sched_barrier allow = <ds_read>
+ // CHECK: rocdl.sched.barrier 512
+ amdgpu.sched_barrier allow = <ds_write>
+ // CHECK: rocdl.sched.barrier 1024
+ amdgpu.sched_barrier allow = <transcendental>
+ // CHECK: rocdl.sched.barrier 18
+ amdgpu.sched_barrier allow = <valu|all_vmem>
+ func.return
+}
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
index 766ad40..ef0e71e 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
@@ -134,3 +134,19 @@ func.func @arith_shrui_i1(%arg0: i1, %arg1: i1) {
%shrui = arith.shrui %arg0, %arg1 : i1
return
}
+
+// -----
+
+func.func @arith_divui_vector(%arg0: vector<5xi32>, %arg1: vector<5xi32>) -> vector<5xi32> {
+ // expected-error @+1 {{failed to legalize operation 'arith.divui'}}
+ %divui = arith.divui %arg0, %arg1 : vector<5xi32>
+ return %divui: vector<5xi32>
+}
+
+// -----
+
+func.func @arith_remui_vector(%arg0: vector<5xi32>, %arg1: vector<5xi32>) -> vector<5xi32> {
+ // expected-error @+1 {{failed to legalize operation 'arith.remui'}}
+ %divui = arith.remui %arg0, %arg1 : vector<5xi32>
+ return %divui: vector<5xi32>
+}
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
index 858ccd1..afd1198 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
@@ -717,3 +717,21 @@ func.func @arith_index_castui(%arg0: i32) -> i32 {
return %int : i32
}
+
+// -----
+
+func.func @arith_divui_remui(%arg0: i32, %arg1: i32) -> i32 {
+ // CHECK-LABEL: arith_divui_remui
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32, %[[Arg1:[^ ]*]]: i32)
+ // CHECK: %[[Conv0:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+ // CHECK: %[[Conv1:.*]] = emitc.cast %[[Arg1]] : i32 to ui32
+ // CHECK: %[[Div:.*]] = emitc.div %[[Conv0]], %[[Conv1]] : (ui32, ui32) -> ui32
+ %div = arith.divui %arg0, %arg1 : i32
+
+ // CHECK: %[[Conv2:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+ // CHECK: %[[Conv3:.*]] = emitc.cast %[[Arg1]] : i32 to ui32
+ // CHECK: %[[Rem:.*]] = emitc.rem %[[Conv2]], %[[Conv3]] : (ui32, ui32) -> ui32
+ %rem = arith.remui %arg0, %arg1 : i32
+
+ return %div : i32
+}
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 86a552c..156a8a4 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -1339,3 +1339,18 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// CHECK-LABEL: @rcp_approx_ftz_f32
+// CHECK-SAME: %[[IN:.*]]: vector<32x16xf32>
+func.func @rcp_approx_ftz_f32(%in: vector<32x16xf32>) {
+ // CHECK: %[[IN_LLVM:.*]] = builtin.unrealized_conversion_cast %[[IN]] : vector<32x16xf32> to !llvm.array<32 x vector<16xf32>>
+ // CHECK: %[[IN1DVEC:.*]] = llvm.extractvalue %[[IN_LLVM]][0] : !llvm.array<32 x vector<16xf32>>
+ // CHECK: %[[OUT1DVEC:.*]] = llvm.mlir.undef : vector<16xf32>
+ // CHECK: %[[IDX_0:.+]] = llvm.mlir.constant(0 : i64) : i64
+ // CHECK: %[[ELEM_0:.*]] = llvm.extractelement %[[IN1DVEC]][%[[IDX_0]] : i64]
+ // CHECK: %[[ELEM_RCP0:.*]] = nvvm.rcp.approx.ftz.f %[[ELEM_0]] : f32
+ // CHECK: llvm.insertelement %[[ELEM_RCP0]], %[[OUT1DVEC]][%[[IDX_0]] : i64] : vector<16xf32>
+ // CHECK-COUNT-511: nvvm.rcp.approx.ftz.f
+ %out = nvgpu.rcp %in {rounding = approx, ftz} : vector<32x16xf32>
+ return
+}
diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
index 4c9e099..d81487d 100644
--- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
+++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir
@@ -253,7 +253,7 @@ llvm.func @_QPomp_target_data_region(%a : !llvm.ptr, %i : !llvm.ptr) {
// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(64 : i32) : i32
// CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.array<1024 x i32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
// CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ARG_1]] : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = ""}
-// CHECK: omp.target thread_limit(%[[VAL_0]] : i32) map_entries(%[[MAP1]] -> %[[BB_ARG0:.*]], %[[MAP2]] -> %[[BB_ARG1:.*]] : !llvm.ptr, !llvm.ptr) {
+// CHECK: omp.target map_entries(%[[MAP1]] -> %[[BB_ARG0:.*]], %[[MAP2]] -> %[[BB_ARG1:.*]] : !llvm.ptr, !llvm.ptr) thread_limit(%[[VAL_0]] : i32) {
// CHECK: ^bb0(%[[BB_ARG0]]: !llvm.ptr, %[[BB_ARG1]]: !llvm.ptr):
// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(10 : i32) : i32
// CHECK: llvm.store %[[VAL_1]], %[[BB_ARG1]] : i32, !llvm.ptr
diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
index edad208..dd0ed77 100644
--- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
+++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
@@ -794,6 +794,32 @@ func.func @shape_cast_size1_vector(%arg0 : vector<f32>) -> vector<1xf32> {
// -----
+// CHECK-LABEL: @step()
+// CHECK: %[[CST0:.*]] = spirv.Constant 0 : i32
+// CHECK: %[[CST1:.*]] = spirv.Constant 1 : i32
+// CHECK: %[[CST2:.*]] = spirv.Constant 2 : i32
+// CHECK: %[[CST3:.*]] = spirv.Constant 3 : i32
+// CHECK: %[[CONSTRUCT:.*]] = spirv.CompositeConstruct %[[CST0]], %[[CST1]], %[[CST2]], %[[CST3]] : (i32, i32, i32, i32) -> vector<4xi32>
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[CONSTRUCT]] : vector<4xi32> to vector<4xindex>
+// CHECK: return %[[CAST]] : vector<4xindex>
+func.func @step() -> vector<4xindex> {
+ %0 = vector.step : vector<4xindex>
+ return %0 : vector<4xindex>
+}
+
+// -----
+
+// CHECK-LABEL: @step_size1()
+// CHECK: %[[CST0:.*]] = spirv.Constant 0 : i32
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[CST0]] : i32 to vector<1xindex>
+// CHECK: return %[[CAST]] : vector<1xindex>
+func.func @step_size1() -> vector<1xindex> {
+ %0 = vector.step : vector<1xindex>
+ return %0 : vector<1xindex>
+}
+
+// -----
+
module attributes {
spirv.target_env = #spirv.target_env<
#spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>>
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 744a096..9457a1b 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -109,6 +109,15 @@ func.func @lds_barrier() {
func.return
}
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+ // CHECK: amdgpu.sched_barrier allow = <none>
+ amdgpu.sched_barrier allow = <none>
+ // CHECK: amdgpu.sched_barrier allow = <valu|all_vmem>
+ amdgpu.sched_barrier allow = <valu|all_vmem>
+ func.return
+}
+
// CHECK-LABEL: func @mfma
func.func @mfma(%arg0 : f32, %arg1 : vector<32xf32>) -> vector<32xf32> {
// CHECK: amdgpu.mfma
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index d34bc8c..b1212b8 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -1169,3 +1169,30 @@ func.func @broadcast_transpose_fold_2dim(%input: tensor<2xf32>,
permutation = [1, 0]
func.return %transpose : tensor<4x2xf32>
}
+
+// -----
+
+func.func @concats_of_fill(
+ %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index)
+ -> tensor<5x?x?xf32>
+{
+ %cst0 = arith.constant 0.0 : f32
+ %cst1 = arith.constant 0.0 : f32
+ %0 = tensor.empty(%arg0, %arg1) : tensor<5x?x?xf32>
+ %1 = linalg.fill ins(%cst0 : f32) outs(%0 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32>
+ %2 = tensor.empty(%arg2, %arg3) : tensor<5x?x?xf32>
+ %3 = linalg.fill ins(%cst1 : f32) outs(%2 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32>
+ %4 = tensor.concat dim(1) %1, %3 : (tensor<5x?x?xf32>, tensor<5x?x?xf32>) -> tensor<5x?x?xf32>
+ return %4 : tensor<5x?x?xf32>
+}
+// CHECK: func @concats_of_fill(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index,
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index,
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index,
+// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index)
+// CHECK-DAG: %[[CST:.+]] = arith.constant 0.0
+// CHECK-DAG: %[[EMPTY0:.+]] = tensor.empty(%[[ARG0]], %[[ARG1]])
+// CHECK-DAG: %[[EMPTY1:.+]] = tensor.empty(%[[ARG2]], %[[ARG3]])
+// CHECK: %[[CONCAT:.+]] = tensor.concat dim(1) %[[EMPTY0]], %[[EMPTY1]]
+// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[CONCAT]] :
+// CHECK: return %[[FILL]]
diff --git a/mlir/test/Dialect/Linalg/tile-tensors.mlir b/mlir/test/Dialect/Linalg/tile-tensors.mlir
index 8918381..8f13c69 100644
--- a/mlir/test/Dialect/Linalg/tile-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-tensors.mlir
@@ -119,7 +119,7 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)>
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK: fold_extract_slice
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x128xf32>
diff --git a/mlir/test/Dialect/Linalg/tile-to-forall.mlir b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
index 8545dfd..778d5bb 100644
--- a/mlir/test/Dialect/Linalg/tile-to-forall.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-forall.mlir
@@ -177,7 +177,6 @@ module attributes {transform.with_named_sequence} {
}
}
-
// -----
// CHECK-DAG: #[[$map0:.+]] = affine_map<()[s0] -> (s0 ceildiv 10)>
@@ -194,13 +193,13 @@ module attributes {transform.with_named_sequence} {
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK: %[[M:.+]] = tensor.dim %[[A]], %c0 :
// CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 :
- // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
- // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
+ // CHECK: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]]]
+ // CHECK: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]]
// CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
- // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
- // CHECK: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
- // CHECK: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
- // CHECK: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
+ // CHECK-DAG: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
+ // CHECK-DAG: %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
+ // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
+ // CHECK-DAG: %[[LB1:.+]] = affine.apply #[[$map6]](%[[IV1]])
// CHECK: tensor.extract_slice %[[A]]
// CHECK: tensor.extract_slice %[[B]]
// CHECK: tensor.extract_slice %[[C_BLK]]
@@ -220,7 +219,6 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-
// -----
// Tests that dimension 0 can eliminate affine.min/max, dimension 1 cannot.
@@ -235,11 +233,11 @@ module attributes {transform.with_named_sequence} {
// CHECK-SAME: %[[C:[0-9a-z]+]]: tensor
func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
// CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]])
- // CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]])
+ // CHECK-DAG: %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]])
+ // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
+ // CHECK-DAG: %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
// CHECK-NOT: affine.max
// CHECK-NOT: affine.min
- // CHECK: %[[LB0:.+]] = affine.apply #[[$map2]](%[[IV0]])
- // CHECK: %[[LB1:.+]] = affine.apply #[[$map3]](%[[IV1]])
// CHECK: %[[tA:.+]] = tensor.extract_slice %[[A]][%[[LB0]], 0] [10, 200] [1, 1] :
// CHECK: %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
// CHECK: %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
@@ -342,7 +340,6 @@ module attributes {transform.with_named_sequence} {
// -----
// CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * -15 + 100, 15)>
-// CHECK-DAG: #[[$map1:.+]] = affine_map<(d0) -> (0, d0)>
// CHECK-DAG: #[[$map2:.+]] = affine_map<(d0) -> (d0 * 15)>
// CHECK-DAG: #[[$map3:.+]] = affine_map<(d0) -> (d0)>
@@ -355,8 +352,7 @@ module attributes {transform.with_named_sequence} {
%OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>)
-> (tensor<100xf32>, tensor<100xf32>) {
// CHECK: scf.forall (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
-// CHECK: %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]])
-// CHECK: %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]])
+// CHECK: %[[TS:.+]] = affine.min #[[$map0]](%[[IV0]])
// CHECK-NOT: affine.min
// CHECK-NOT: affine.max
// CHECK: %[[LB:.+]] = affine.apply #[[$map2]](%[[IV0]])
@@ -467,16 +463,16 @@ module attributes {transform.with_named_sequence} {
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK: %[[c1:.*]] = arith.constant 1 : index
// CHECK: %[[c0:.*]] = arith.constant 0 : index
- // CHECK: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
- // CHECK: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
- // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
- // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
- // CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
+ // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
+ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
+ // CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
+ // CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
+ // CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
// CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
- // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
- // CHECK: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
- // CHECK: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
- // CHECK: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
+ // CHECK-DAG: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
+ // CHECK-DAG: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
+ // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
+ // CHECK-DAG: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
// CHECK: tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
// CHECK: tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
// CHECK: tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
@@ -535,16 +531,16 @@ module attributes {transform.with_named_sequence} {
func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
// CHECK: %[[c1:.*]] = arith.constant 1 : index
// CHECK: %[[c0:.*]] = arith.constant 0 : index
- // CHECK: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
- // CHECK: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
- // CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
- // CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
- // CHECK: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
+ // CHECK-DAG: %[[M:.+]] = tensor.dim %[[A]], %[[c0]] :
+ // CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %[[c1]] :
+ // CHECK-DAG: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
+ // CHECK-DAG: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
+ // CHECK-DAG: %[[K:.+]] = tensor.dim %[[A]], %[[c1]] :
// CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
- // CHECK: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
- // CHECK: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
- // CHECK: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
- // CHECK: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
+ // CHECK-DAG: %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
+ // CHECK-DAG: %[[TS1:.+]] = affine.min #[[$map3]](%[[IV1]])[%[[N]]]
+ // CHECK-DAG: %[[LB0:.+]] = affine.apply #[[$map4]](%[[IV0]])
+ // CHECK-DAG: %[[LB1:.+]] = affine.apply #[[$map5]](%[[IV1]])
// CHECK: tensor.extract_slice %[[A]][%[[LB0]], 0] [%[[TS0]], %[[K]]] [1, 1] :
// CHECK: tensor.extract_slice %[[B]][0, %[[LB1]]] [%[[K]], %[[TS1]]] [1, 1] :
// CHECK: tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [%[[TS0]], %[[TS1]]] [1, 1] :
diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
index 955ea6b..7bac850 100644
--- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-tile.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --transform-interpreter --mlir-print-local-scope --split-input-file --verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt --transform-interpreter --mlir-print-local-scope --split-input-file --verify-diagnostics --cse %s | FileCheck %s
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -178,14 +178,13 @@ module {
// CHECK-LABEL: func.func @scalable_tile(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<?xf32>, %[[ARG_2:.*]]: tensor<?xf32>,
-// CHECK: %[[C4:.*]] = arith.constant 0 : index
-// CHECK: %[[DIM:.*]] = tensor.dim %[[ARG_0]], %[[C4]] : tensor<?xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[DIM:.*]] = tensor.dim %[[ARG_0]], %[[C0]] : tensor<?xf32>
// CHECK: %[[VEC_SIZE:.*]] = arith.constant 4 : index
// CHECK: %[[VS:.*]] = vector.vscale
// CHECK: %[[STEP:.*]] = arith.muli %[[VEC_SIZE]], %[[VS]] : index
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: scf.for %[[IV:.*]] = %[[C0]] to %[[DIM]] step %[[STEP]] iter_args(%[[VAL:.*]] = %[[ARG_2]]) -> (tensor<?xf32>) {
-// CHECK: %[[SIZE:.*]] = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%[[IV]])[%[[STEP]], %[[DIM]]]
+// CHECK: %[[SIZE:.*]] = affine.min affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)>(%[[IV]])[%[[DIM]], %[[STEP]]]
// CHECK: %[[SLICE_ARG0:.*]] = tensor.extract_slice %[[ARG_0]][%[[IV]]] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
// CHECK: %[[SLICE_ARG1:.*]] = tensor.extract_slice %[[ARG_1]][%[[IV]]] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
// CHECK: %[[SLICE_ARG2:.*]] = tensor.extract_slice %[[VAL]][%[[IV]]] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
@@ -202,20 +201,14 @@ module {
// -----
// CHECK-LABEL: func.func @scalable_and_fixed_length_tile
-// CHECK: %[[C4:.*]] = arith.constant 4 : index
-// CHECK: %[[VS:.*]] = vector.vscale
-// CHECK: %[[STEP_2:.*]] = arith.muli %[[C4]], %[[VS]] : index
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[C128:.*]] = arith.constant 128 : index
-// CHECK: %[[STEP_0:.*]] = arith.constant 4 : index
-// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C128]] step %[[STEP_0]]
-// CHECK: %[[C0_1:.*]] = arith.constant 0 : index
-// CHECK: %[[C128_1:.*]] = arith.constant 128 : index
-// CHECK: %[[STEP_1:.*]] = arith.constant 4 : index
-// CHECK: scf.for %[[VAL_16:.*]] = %[[C0_1]] to %[[C128_1]] step %[[STEP_1]]
-// CHECK: %[[C0_2:.*]] = arith.constant 0 : index
-// CHECK: %[[C128_2:.*]] = arith.constant 128 : index
-// CHECK: scf.for %{{.*}} = %[[C0_2]] to %[[C128_2]] step %[[STEP_2]]
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG: %[[VS:.*]] = vector.vscale
+// CHECK-DAG: %[[STEP_2:.*]] = arith.muli %[[C4]], %[[VS]] : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
+// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C128]] step %[[C4]]
+// CHECK: scf.for %[[VAL_16:.*]] = %[[C0]] to %[[C128]] step %[[C4]]
+// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C128]] step %[[STEP_2]]
func.func @scalable_and_fixed_length_tile(
%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>)
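The CHECK → CHECK-DAG conversions in the Linalg tiling tests above relax ordering constraints: CHECK-DAG lines within a block may match in any order, while plain CHECK lines must match in sequence, so hoisted constants and affine computations are allowed to be emitted in a different order. A minimal, hypothetical FileCheck sketch (not part of this patch) of that behavior:

// Hypothetical example: the two CHECK-DAG lines still match although the
// constants appear in the opposite order; plain CHECK lines would require
// this exact ordering.
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK:     arith.addi %[[C1]], %[[C2]] : index
func.func @check_dag_sketch() -> index {
  %c2 = arith.constant 2 : index
  %c1 = arith.constant 1 : index
  %sum = arith.addi %c1, %c2 : index
  return %sum : index
}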
diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
index d7ff1de..3404b73 100644
--- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
@@ -1240,8 +1240,8 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: func @red_max_2d(
-func.func @red_max_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL: func @red_maximumf_2d(
+func.func @red_maximumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
// CHECK: %[[CMINF:.+]] = arith.constant dense<-3.402820e+38> : vector<4xf32>
// CHECK: tensor.empty() : tensor<4xf32>
// CHECK: vector.multi_reduction <maximumf>, {{.*}}, %[[CMINF]] [1] : vector<4x4xf32> to vector<4xf32>
@@ -1272,8 +1272,40 @@ module attributes {transform.with_named_sequence} {
// -----
-// CHECK-LABEL: func @red_min_2d(
-func.func @red_min_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL: func @red_maxnumf_2d(
+func.func @red_maxnumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+ // CHECK: %[[CMINF:.+]] = arith.constant dense<-3.402820e+38> : vector<4xf32>
+ // CHECK: tensor.empty() : tensor<4xf32>
+ // CHECK: vector.multi_reduction <maxnumf>, {{.*}}, %[[CMINF]] [1] : vector<4x4xf32> to vector<4xf32>
+ // CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
+ %ident = arith.constant -3.40282e+38 : f32
+ %init = tensor.empty() : tensor<4xf32>
+ %fill = linalg.fill ins(%ident : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+ %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"]}
+ ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
+ ^bb0(%in0: f32, %out0: f32):
+ %max = arith.maxnumf %in0, %out0 : f32
+ linalg.yield %max : f32
+ } -> tensor<4xf32>
+ return %red : tensor<4xf32>
+}
+
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %4 = transform.get_parent_op %3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %5 = transform.structured.vectorize_children_and_apply_patterns %4 { vectorize_padding } : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @red_minimumf_2d(
+func.func @red_minimumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
// CHECK: %[[CMAXF:.+]] = arith.constant dense<3.402820e+38> : vector<4xf32>
// CHECK: tensor.empty() : tensor<4xf32>
// CHECK: vector.transfer_read {{.*}} : tensor<4x4xf32>, vector<4x4xf32>
@@ -1305,6 +1337,39 @@ module attributes {transform.with_named_sequence} {
// -----
+// CHECK-LABEL: func @red_minnumf_2d(
+func.func @red_minnumf_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
+ // CHECK: %[[CMAXF:.+]] = arith.constant dense<3.402820e+38> : vector<4xf32>
+ // CHECK: tensor.empty() : tensor<4xf32>
+ // CHECK: vector.transfer_read {{.*}} : tensor<4x4xf32>, vector<4x4xf32>
+ // CHECK: vector.multi_reduction <minnumf>, {{.*}}, %[[CMAXF]] [1] : vector<4x4xf32> to vector<4xf32>
+ // CHECK: vector.transfer_write {{.*}} : vector<4xf32>, tensor<4xf32>
+ %maxf32 = arith.constant 3.40282e+38 : f32
+ %init = tensor.empty() : tensor<4xf32>
+ %fill = linalg.fill ins(%maxf32 : f32) outs(%init : tensor<4xf32>) -> tensor<4xf32>
+ %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"]}
+ ins(%arg0 : tensor<4x4xf32>) outs(%fill : tensor<4xf32>) {
+ ^bb0(%in0: f32, %out0: f32):
+ %min = arith.minnumf %out0, %in0 : f32
+ linalg.yield %min : f32
+ } -> tensor<4xf32>
+ return %red : tensor<4xf32>
+}
+
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %3 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %4 = transform.get_parent_op %3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+ %5 = transform.structured.vectorize_children_and_apply_patterns %4 : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
// CHECK-LABEL: func @red_mul_2d(
func.func @red_mul_2d(%arg0: tensor<4x4xf32>) -> tensor<4xf32> {
// CHECK: tensor.empty() : tensor<4xf32>
diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
index c3aed35..f7db114 100644
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -336,3 +336,21 @@ func.func @tma_generate_descriptor_incorrect_last_dim(%desc: !desc, %buffer2: m
nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<64x128xf32,3>
return
}
+// -----
+
+func.func @rcp_unsupported_rounding_0(%in : vector<16xf32>) {
+ // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rn> or non-ftz is not supported yet.}}
+ %out = nvgpu.rcp %in {rounding = rn, ftz} : vector<16xf32>
+}
+// -----
+
+func.func @rcp_unsupported_rounding_1(%in : vector<16xf32>) {
+ // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rz> or non-ftz is not supported yet.}}
+ %out = nvgpu.rcp %in {rounding = rz} : vector<16xf32>
+}
+// -----
+
+func.func @rcp_unsupported_ftz(%in : vector<16xf32>) {
+ // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode approx> or non-ftz is not supported yet.}}
+ %out = nvgpu.rcp %in {rounding = approx} : vector<16xf32>
+}
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 9977dd5..1d1d93f 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -420,8 +420,8 @@ func.func @omp_simd_aligned_mismatch(%arg0 : index, %arg1 : index,
omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
omp.yield
}
- }) {alignment_values = [128],
- operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+ }) {alignments = [128],
+ operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
return
}
@@ -435,7 +435,7 @@ func.func @omp_simd_aligned_negative(%arg0 : index, %arg1 : index,
omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
omp.yield
}
- }) {alignment_values = [-1, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+ }) {alignments = [-1, 128], operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
return
}
@@ -449,7 +449,7 @@ func.func @omp_simd_unexpected_alignment(%arg0 : index, %arg1 : index,
omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
omp.yield
}
- }) {alignment_values = [1, 128]} : () -> ()
+ }) {alignments = [1, 128]} : () -> ()
return
}
@@ -463,7 +463,7 @@ func.func @omp_simd_aligned_float(%arg0 : index, %arg1 : index,
omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
omp.yield
}
- }) {alignment_values = [1.5, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+ }) {alignments = [1.5, 128], operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
return
}
@@ -477,7 +477,7 @@ func.func @omp_simd_aligned_the_same_var(%arg0 : index, %arg1 : index,
omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
omp.yield
}
- }) {alignment_values = [1, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+ }) {alignments = [1, 128], operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
return
}
@@ -491,7 +491,7 @@ func.func @omp_simd_nontemporal_the_same_var(%arg0 : index, %arg1 : index,
omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) {
omp.yield
}
- }) {operandSegmentSizes = array<i32: 0, 0, 2>} : (memref<i32>, memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
return
}
@@ -839,7 +839,7 @@ func.func @omp_ordered_region3(%x : i32) -> () {
func.func @omp_ordered1(%vec0 : i64) -> () {
// expected-error @below {{op must be nested inside of a loop}}
- omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
return
}
@@ -849,7 +849,7 @@ func.func @omp_ordered2(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> (
omp.distribute {
omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
// expected-error @below {{op must be nested inside of a worksharing, simd or worksharing simd loop}}
- omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
omp.yield
}
omp.terminator
@@ -863,7 +863,7 @@ func.func @omp_ordered3(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> (
omp.wsloop {
omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
// expected-error @below {{the enclosing worksharing-loop region must have an ordered clause}}
- omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
omp.yield
}
omp.terminator
@@ -877,7 +877,7 @@ func.func @omp_ordered4(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64) -> (
omp.wsloop ordered(0) {
omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
// expected-error @below {{the enclosing loop's ordered clause must have a parameter present}}
- omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
omp.yield
}
omp.terminator
@@ -891,7 +891,7 @@ func.func @omp_ordered5(%arg1 : i32, %arg2 : i32, %arg3 : i32, %vec0 : i64, %vec
omp.wsloop ordered(1) {
omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
// expected-error @below {{number of variables in depend clause does not match number of iteration variables in the doacross loop}}
- omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64}
+ omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {doacross_num_loops = 2 : i64}
omp.yield
}
omp.terminator
@@ -1394,7 +1394,7 @@ func.func @omp_teams_allocate(%data_var : memref<i32>) {
// expected-error @below {{expected equal sizes for allocate and allocator variables}}
"omp.teams" (%data_var) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 0,0,0,0,1,0,0>} : (memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,0,0,0,0,0,0,0>} : (memref<i32>) -> ()
omp.terminator
}
return
@@ -1407,7 +1407,7 @@ func.func @omp_teams_num_teams1(%lb : i32) {
// expected-error @below {{expected num_teams upper bound to be defined if the lower bound is defined}}
"omp.teams" (%lb) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,0,0,0,0,0,0>} : (i32) -> ()
+ }) {operandSegmentSizes = array<i32: 0,0,0,1,0,0,0,0>} : (i32) -> ()
omp.terminator
}
return
@@ -1432,7 +1432,7 @@ func.func @omp_sections(%data_var : memref<i32>) -> () {
// expected-error @below {{expected equal sizes for allocate and allocator variables}}
"omp.sections" (%data_var) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 0,1,0>} : (memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,0,0,0>} : (memref<i32>) -> ()
return
}
@@ -1442,7 +1442,7 @@ func.func @omp_sections(%data_var : memref<i32>) -> () {
// expected-error @below {{expected as many reduction symbol references as reduction variables}}
"omp.sections" (%data_var) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,0,0>} : (memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 0,0,0,1>} : (memref<i32>) -> ()
return
}
@@ -1557,17 +1557,17 @@ func.func @omp_single(%data_var : memref<i32>) -> () {
// expected-error @below {{expected equal sizes for allocate and allocator variables}}
"omp.single" (%data_var) ({
omp.barrier
- }) {operandSegmentSizes = array<i32: 1,0,0>} : (memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,0,0,0>} : (memref<i32>) -> ()
return
}
// -----
func.func @omp_single_copyprivate(%data_var : memref<i32>) -> () {
- // expected-error @below {{inconsistent number of copyPrivate vars (= 1) and functions (= 0), both must be equal}}
+ // expected-error @below {{inconsistent number of copyprivate vars (= 1) and functions (= 0), both must be equal}}
"omp.single" (%data_var) ({
omp.barrier
- }) {operandSegmentSizes = array<i32: 0,0,1>} : (memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 0,0,1,0>} : (memref<i32>) -> ()
return
}
@@ -1623,7 +1623,7 @@ func.func @omp_task_depend(%data_var: memref<i32>) {
// expected-error @below {{op expected as many depend values as depend variables}}
"omp.task"(%data_var) ({
"omp.terminator"() : () -> ()
- }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 0, 1, 0, 0>} : (memref<i32>) -> ()
+ }) {depend_kinds = [], operandSegmentSizes = array<i32: 0, 0, 1, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
"func.return"() : () -> ()
}
@@ -1820,7 +1820,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
omp.yield
}
- }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 1, 0, 0, 0>} : (memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
return
}
@@ -1834,7 +1834,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
omp.yield
}
- }) {operandSegmentSizes = array<i32: 0, 0, 0, 2, 0, 0, 0, 0, 0>, reductions = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> ()
+ }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 2>, reduction_syms = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> ()
return
}
@@ -1847,7 +1847,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
omp.yield
}
- }) {operandSegmentSizes = array<i32: 0, 0, 0, 1, 0, 0, 0, 0, 0>, reductions = [@add_f32, @add_f32]} : (!llvm.ptr) -> ()
+ }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 0, 0, 0, 0, 1>, reduction_syms = [@add_f32, @add_f32]} : (!llvm.ptr) -> ()
return
}
@@ -1861,7 +1861,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
omp.yield
}
- }) {in_reductions = [@add_f32], operandSegmentSizes = array<i32: 0, 0, 2, 0, 0, 0, 0, 0, 0>} : (!llvm.ptr, !llvm.ptr) -> ()
+ }) {in_reduction_syms = [@add_f32], operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>} : (!llvm.ptr, !llvm.ptr) -> ()
return
}
@@ -1874,7 +1874,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
omp.yield
}
- }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array<i32: 0, 0, 1, 0, 0, 0, 0, 0, 0>} : (!llvm.ptr) -> ()
+ }) {in_reduction_syms = [@add_f32, @add_f32], operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>} : (!llvm.ptr) -> ()
return
}
@@ -1934,7 +1934,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
%testi64 = "test.i64"() : () -> (i64)
// expected-error @below {{the grainsize clause and num_tasks clause are mutually exclusive and may not appear on the same taskloop directive}}
- omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) {
+ omp.taskloop grainsize(%testi64: i64) num_tasks(%testi64: i64) {
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
omp.yield
}
@@ -2001,7 +2001,7 @@ func.func @omp_target_data(%map1: memref<?xi32>) {
// -----
func.func @omp_target_data() {
- // expected-error @below {{At least one of map, useDevicePtr, or useDeviceAddr operand must be present}}
+ // expected-error @below {{At least one of map, use_device_ptr_vars, or use_device_addr_vars operand must be present}}
omp.target_data {}
return
}
@@ -2020,7 +2020,7 @@ func.func @omp_target_enter_data(%map1: memref<?xi32>) {
func.func @omp_target_enter_data_depend(%a: memref<?xi32>) {
%0 = omp.map.info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32>
// expected-error @below {{op expected as many depend values as depend variables}}
- omp.target_enter_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+ omp.target_enter_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 1, 0, 0, 0>}
return
}
@@ -2038,7 +2038,7 @@ func.func @omp_target_exit_data(%map1: memref<?xi32>) {
func.func @omp_target_exit_data_depend(%a: memref<?xi32>) {
%0 = omp.map.info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(from) capture(ByRef) -> memref<?xi32>
// expected-error @below {{op expected as many depend values as depend variables}}
- omp.target_exit_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+ omp.target_exit_data map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 1, 0, 0, 0>}
return
}
@@ -2119,7 +2119,7 @@ llvm.mlir.global internal @_QFsubEx() : i32
func.func @omp_target_update_data_depend(%a: memref<?xi32>) {
%0 = omp.map.info var_ptr(%a: memref<?xi32>, tensor<?xi32>) map_clauses(to) capture(ByRef) -> memref<?xi32>
// expected-error @below {{op expected as many depend values as depend variables}}
- omp.target_update map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 0, 0, 1, 0>}
+ omp.target_update map_entries(%0: memref<?xi32> ) {operandSegmentSizes = array<i32: 1, 0, 0, 0>}
return
}
@@ -2129,7 +2129,7 @@ func.func @omp_target_depend(%data_var: memref<i32>) {
// expected-error @below {{op expected as many depend values as depend variables}}
"omp.target"(%data_var) ({
"omp.terminator"() : () -> ()
- }) {depends = [], operandSegmentSizes = array<i32: 0, 0, 0, 1, 0, 0, 0, 0>} : (memref<i32>) -> ()
+ }) {depend_kinds = [], operandSegmentSizes = array<i32: 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
"func.return"() : () -> ()
}
@@ -2137,7 +2137,7 @@ func.func @omp_target_depend(%data_var: memref<i32>) {
func.func @omp_distribute_schedule(%chunk_size : i32) -> () {
// expected-error @below {{op chunk size set without dist_schedule_static being present}}
- "omp.distribute"(%chunk_size) <{operandSegmentSizes = array<i32: 1, 0, 0>}> ({
+ "omp.distribute"(%chunk_size) <{operandSegmentSizes = array<i32: 0, 0, 1, 0>}> ({
"omp.terminator"() : () -> ()
}) : (i32) -> ()
}
@@ -2146,7 +2146,7 @@ func.func @omp_distribute_schedule(%chunk_size : i32) -> () {
func.func @omp_distribute_allocate(%data_var : memref<i32>) -> () {
// expected-error @below {{expected equal sizes for allocate and allocator variables}}
- "omp.distribute"(%data_var) <{operandSegmentSizes = array<i32: 0, 1, 0>}> ({
+ "omp.distribute"(%data_var) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>}> ({
"omp.terminator"() : () -> ()
}) : (memref<i32>) -> ()
}
@@ -2340,7 +2340,7 @@ func.func @undefined_privatizer(%arg0: index) {
// -----
func.func @undefined_privatizer(%arg0: !llvm.ptr) {
// expected-error @below {{inconsistent number of private variables and privatizer op symbols, private vars: 1 vs. privatizer op symbols: 2}}
- "omp.parallel"(%arg0) <{operandSegmentSizes = array<i32: 0, 0, 0, 0, 0, 1>, privatizers = [@x.privatizer, @y.privatizer]}> ({
+ "omp.parallel"(%arg0) <{operandSegmentSizes = array<i32: 0, 0, 0, 0, 1, 0>, private_syms = [@x.privatizer, @y.privatizer]}> ({
^bb0(%arg2: !llvm.ptr):
omp.terminator
}) : (!llvm.ptr) -> ()
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index d6f4a81..d292499 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -67,37 +67,37 @@ func.func @omp_terminator() -> () {
func.func @omp_parallel(%data_var : memref<i32>, %if_cond : i1, %num_threads : i32, %idx : index) -> () {
// CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
- "omp.parallel" (%if_cond, %num_threads, %data_var, %data_var) ({
+ "omp.parallel" (%data_var, %data_var, %if_cond, %num_threads) ({
// test without if condition
// CHECK: omp.parallel num_threads(%{{.*}} : i32) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
- "omp.parallel"(%num_threads, %data_var, %data_var) ({
+ "omp.parallel"(%data_var, %data_var, %num_threads) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 0,1,1,1,0,0>} : (i32, memref<i32>, memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,1,0,1,0,0>} : (memref<i32>, memref<i32>, i32) -> ()
// CHECK: omp.barrier
omp.barrier
// test without num_threads
// CHECK: omp.parallel if(%{{.*}}) allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
- "omp.parallel"(%if_cond, %data_var, %data_var) ({
+ "omp.parallel"(%data_var, %data_var, %if_cond) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,0,1,1,0,0>} : (i1, memref<i32>, memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,1,1,0,0,0>} : (memref<i32>, memref<i32>, i1) -> ()
// test without allocate
// CHECK: omp.parallel if(%{{.*}}) num_threads(%{{.*}} : i32)
"omp.parallel"(%if_cond, %num_threads) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,1,0,0,0,0>} : (i1, i32) -> ()
+ }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0>} : (i1, i32) -> ()
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,1,1,1,0,0>, proc_bind_val = #omp<procbindkind spread>} : (i1, i32, memref<i32>, memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,1,1,1,0,0>, proc_bind_kind = #omp<procbindkind spread>} : (memref<i32>, memref<i32>, i1, i32) -> ()
// test with multiple parameters for single variadic argument
// CHECK: omp.parallel allocate(%{{.*}} : memref<i32> -> %{{.*}} : memref<i32>)
"omp.parallel" (%data_var, %data_var) ({
omp.terminator
- }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0>} : (memref<i32>, memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,1,0,0,0,0>} : (memref<i32>, memref<i32>) -> ()
// CHECK: omp.distribute
omp.distribute {
@@ -184,7 +184,7 @@ func.func @omp_loop_nest(%lb : index, %ub : index, %step : index) -> () {
"omp.loop_nest" (%lb, %ub, %step) ({
^bb0(%iv: index):
omp.yield
- }) {inclusive} : (index, index, index) -> ()
+ }) {loop_inclusive} : (index, index, index) -> ()
omp.terminator
}
@@ -382,7 +382,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
omp.yield
}
omp.terminator
- }) {operandSegmentSizes = array<i32: 0,0,0,0>, ordered_val = 1} :
+ }) {operandSegmentSizes = array<i32: 0,0,0,0,0,0,0>, ordered = 1} :
() -> ()
// CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(static) {
@@ -392,7 +392,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
omp.yield
}
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,1,0,0>, schedule_val = #omp<schedulekind static>} :
+ }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0,0>, schedule_kind = #omp<schedulekind static>} :
(memref<i32>, i32) -> ()
// CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>, %{{.*}} = %{{.*}} : memref<i32>) schedule(static) {
@@ -402,7 +402,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
omp.yield
}
omp.terminator
- }) {operandSegmentSizes = array<i32: 2,2,0,0>, schedule_val = #omp<schedulekind static>} :
+ }) {operandSegmentSizes = array<i32: 0,0,2,2,0,0,0>, schedule_kind = #omp<schedulekind static>} :
(memref<i32>, memref<i32>, i32, i32) -> ()
// CHECK: omp.wsloop linear(%{{.*}} = %{{.*}} : memref<i32>) schedule(dynamic = %{{.*}}) ordered(2) {
@@ -412,7 +412,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
omp.yield
}
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,1,0,1>, schedule_val = #omp<schedulekind dynamic>, ordered_val = 2} :
+ }) {operandSegmentSizes = array<i32: 0,0,1,1,0,0,1>, schedule_kind = #omp<schedulekind dynamic>, ordered = 2} :
(memref<i32>, i32, i32) -> ()
// CHECK: omp.wsloop schedule(auto) nowait {
@@ -422,7 +422,7 @@ func.func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memre
omp.yield
}
omp.terminator
- }) {operandSegmentSizes = array<i32: 0,0,0,0>, nowait, schedule_val = #omp<schedulekind auto>} :
+ }) {operandSegmentSizes = array<i32: 0,0,0,0,0,0,0>, nowait, schedule_kind = #omp<schedulekind auto>} :
() -> ()
// CHECK: omp.wsloop {
@@ -574,8 +574,8 @@ func.func @omp_simd_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index,
"omp.yield"() : () -> ()
}) : (index, index, index) -> ()
"omp.terminator"() : () -> ()
- }) {alignment_values = [32, 128],
- operandSegmentSizes = array<i32: 2, 0, 0>} : (memref<i32>, memref<i32>) -> ()
+ }) {alignments = [32, 128],
+ operandSegmentSizes = array<i32: 2, 0, 0, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
return
}
@@ -589,8 +589,8 @@ func.func @omp_simd_aligned_single(%arg0 : index, %arg1 : index, %arg2 : index,
"omp.yield"() : () -> ()
}) : (index, index, index) -> ()
"omp.terminator"() : () -> ()
- }) {alignment_values = [32],
- operandSegmentSizes = array<i32: 1, 0, 0>} : (memref<i32>) -> ()
+ }) {alignments = [32],
+ operandSegmentSizes = array<i32: 1, 0, 0, 0, 0, 0, 0>} : (memref<i32>) -> ()
return
}
@@ -605,7 +605,7 @@ func.func @omp_simd_nontemporal_list(%arg0 : index, %arg1 : index,
"omp.yield"() : () -> ()
}) : (index, index, index) -> ()
"omp.terminator"() : () -> ()
- }) {operandSegmentSizes = array<i32: 0, 0, 2>} : (memref<i32>, memref<i64>) -> ()
+ }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 2, 0, 0>} : (memref<i32>, memref<i64>) -> ()
return
}
@@ -620,7 +620,7 @@ func.func @omp_simd_nontemporal_single(%arg0 : index, %arg1 : index,
"omp.yield"() : () -> ()
}) : (index, index, index) -> ()
"omp.terminator"() : () -> ()
- }) {operandSegmentSizes = array<i32: 0, 0, 1>} : (memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 0, 0, 0, 0, 1, 0, 0>} : (memref<i32>) -> ()
return
}
@@ -752,8 +752,8 @@ func.func @omp_distribute(%chunk_size : i32, %data_var : memref<i32>, %arg0 : i3
}
omp.terminator
}
- // CHECK: omp.distribute dist_schedule_static chunk_size(%{{.+}} : i32)
- omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) {
+ // CHECK: omp.distribute dist_schedule_static dist_schedule_chunk_size(%{{.+}} : i32)
+ omp.distribute dist_schedule_static dist_schedule_chunk_size(%chunk_size : i32) {
omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) {
omp.yield
}
@@ -805,16 +805,16 @@ func.func @omp_distribute(%chunk_size : i32, %data_var : memref<i32>, %arg0 : i3
func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %device_ptr: memref<i32>, %device_addr: memref<?xi32>, %map1: memref<?xi32>, %map2: memref<?xi32>) -> () {
// Test with optional operands; if_expr, device, thread_limit, private, firstprivate and nowait.
- // CHECK: omp.target if({{.*}}) device({{.*}}) thread_limit({{.*}}) nowait
- "omp.target"(%if_cond, %device, %num_threads) ({
+ // CHECK: omp.target device({{.*}}) if({{.*}}) nowait thread_limit({{.*}})
+ "omp.target"(%device, %if_cond, %num_threads) ({
// CHECK: omp.terminator
omp.terminator
- }) {nowait, operandSegmentSizes = array<i32: 1,1,1,0,0,0,0,0>} : ( i1, si32, i32 ) -> ()
+ }) {nowait, operandSegmentSizes = array<i32: 0,0,0,1,0,1,0,0,0,0,1>} : ( si32, i1, i32 ) -> ()
// Test with optional map clause.
// CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_1:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
// CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
- // CHECK: omp.target is_device_ptr(%[[VAL_4:.*]] : memref<i32>) has_device_addr(%[[VAL_5:.*]] : memref<?xi32>) map_entries(%[[MAP_A]] -> {{.*}}, %[[MAP_B]] -> {{.*}} : memref<?xi32>, memref<?xi32>) {
+ // CHECK: omp.target has_device_addr(%[[VAL_5:.*]] : memref<?xi32>) is_device_ptr(%[[VAL_4:.*]] : memref<i32>) map_entries(%[[MAP_A]] -> {{.*}}, %[[MAP_B]] -> {{.*}} : memref<?xi32>, memref<?xi32>) {
%mapv1 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>) map_clauses(tofrom) capture(ByRef) -> memref<?xi32> {name = ""}
%mapv2 = omp.map.info var_ptr(%map2 : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
omp.target map_entries(%mapv1 -> %arg0, %mapv2 -> %arg1 : memref<?xi32>, memref<?xi32>) is_device_ptr(%device_ptr : memref<i32>) has_device_addr(%device_addr : memref<?xi32>) {
@@ -838,12 +838,12 @@ func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %devic
func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref<i32>, %device_addr: memref<?xi32>, %map1: memref<?xi32>, %map2: memref<?xi32>) -> () {
// CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(always, from) capture(ByRef) -> memref<?xi32> {name = ""}
- // CHECK: omp.target_data if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) map_entries(%[[MAP_A]] : memref<?xi32>)
+ // CHECK: omp.target_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref<?xi32>)
%mapv1 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>) map_clauses(always, from) capture(ByRef) -> memref<?xi32> {name = ""}
omp.target_data if(%if_cond) device(%device : si32) map_entries(%mapv1 : memref<?xi32>){}
// CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_2:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(close, present, to) capture(ByRef) -> memref<?xi32> {name = ""}
- // CHECK: omp.target_data use_device_ptr(%[[VAL_3:.*]] : memref<i32>) use_device_addr(%[[VAL_4:.*]] : memref<?xi32>) map_entries(%[[MAP_A]] : memref<?xi32>)
+ // CHECK: omp.target_data map_entries(%[[MAP_A]] : memref<?xi32>) use_device_addr(%[[VAL_4:.*]] : memref<?xi32>) use_device_ptr(%[[VAL_3:.*]] : memref<i32>)
%mapv2 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>) map_clauses(close, present, to) capture(ByRef) -> memref<?xi32> {name = ""}
omp.target_data use_device_ptr(%device_ptr : memref<i32>) use_device_addr(%device_addr : memref<?xi32>) map_entries(%mapv2 : memref<?xi32>) {}
@@ -855,12 +855,12 @@ func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref<i
omp.target_data map_entries(%mapv3, %mapv4 : memref<?xi32>, memref<?xi32>) {}
// CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
- // CHECK: omp.target_enter_data if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) nowait map_entries(%[[MAP_A]] : memref<?xi32>)
+ // CHECK: omp.target_enter_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref<?xi32>) nowait
%mapv5 = omp.map.info var_ptr(%map1 : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
omp.target_enter_data if(%if_cond) device(%device : si32) nowait map_entries(%mapv5 : memref<?xi32>)
// CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAL_3:.*]] : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
- // CHECK: omp.target_exit_data if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) nowait map_entries(%[[MAP_A]] : memref<?xi32>)
+ // CHECK: omp.target_exit_data device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%[[MAP_A]] : memref<?xi32>) nowait
%mapv6 = omp.map.info var_ptr(%map2 : memref<?xi32>, tensor<?xi32>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> memref<?xi32> {name = ""}
omp.target_exit_data if(%if_cond) device(%device : si32) nowait map_entries(%mapv6 : memref<?xi32>)
@@ -869,12 +869,12 @@ func.func @omp_target_data (%if_cond : i1, %device : si32, %device_ptr: memref<i
// CHECK-LABEL: omp_target_pretty
func.func @omp_target_pretty(%if_cond : i1, %device : si32, %num_threads : i32) -> () {
- // CHECK: omp.target if({{.*}}) device({{.*}})
+ // CHECK: omp.target device({{.*}}) if({{.*}})
omp.target if(%if_cond) device(%device : si32) {
omp.terminator
}
- // CHECK: omp.target if({{.*}}) device({{.*}}) nowait
+ // CHECK: omp.target device({{.*}}) if({{.*}}) nowait
omp.target if(%if_cond) device(%device : si32) thread_limit(%num_threads : i32) nowait {
omp.terminator
}
@@ -1294,11 +1294,11 @@ func.func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32,
omp.wsloop ordered(1) {
omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
// Only one DEPEND(SINK: vec) clause
- // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64}
- omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+ // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}} : i64) {doacross_num_loops = 1 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
- // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}} : i64) {num_loops_val = 1 : i64}
- omp.ordered depend_type(dependsource) depend_vec(%vec0 : i64) {num_loops_val = 1 : i64}
+ // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}} : i64) {doacross_num_loops = 1 : i64}
+ omp.ordered depend_type(dependsource) depend_vec(%vec0 : i64) {doacross_num_loops = 1 : i64}
omp.yield
}
@@ -1308,11 +1308,11 @@ func.func @omp_ordered(%arg1 : i32, %arg2 : i32, %arg3 : i32,
omp.wsloop ordered(2) {
omp.loop_nest (%0) : i32 = (%arg1) to (%arg2) step (%arg3) {
// Multiple DEPEND(SINK: vec) clauses
- // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {num_loops_val = 2 : i64}
- omp.ordered depend_type(dependsink) depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {num_loops_val = 2 : i64}
+ // CHECK: omp.ordered depend_type(dependsink) depend_vec(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i64, i64, i64, i64) {doacross_num_loops = 2 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%vec0, %vec1, %vec2, %vec3 : i64, i64, i64, i64) {doacross_num_loops = 2 : i64}
- // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}}, %{{.*}} : i64, i64) {num_loops_val = 2 : i64}
- omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {num_loops_val = 2 : i64}
+ // CHECK: omp.ordered depend_type(dependsource) depend_vec(%{{.*}}, %{{.*}} : i64, i64) {doacross_num_loops = 2 : i64}
+ omp.ordered depend_type(dependsource) depend_vec(%vec0, %vec1 : i64, i64) {doacross_num_loops = 2 : i64}
omp.yield
}
@@ -1874,13 +1874,13 @@ func.func @omp_sectionsop(%data_var1 : memref<i32>, %data_var2 : memref<i32>,
"omp.sections" (%data_var1, %data_var1) ({
// CHECK: omp.terminator
omp.terminator
- }) {operandSegmentSizes = array<i32: 0,1,1>} : (memref<i32>, memref<i32>) -> ()
+ }) {operandSegmentSizes = array<i32: 1,1,0,0>} : (memref<i32>, memref<i32>) -> ()
// CHECK: omp.sections reduction(@add_f32 -> %{{.*}} : !llvm.ptr)
"omp.sections" (%redn_var) ({
// CHECK: omp.terminator
omp.terminator
- }) {operandSegmentSizes = array<i32: 1,0,0>, reduction_vars_byref = array<i1: false>, reductions=[@add_f32]} : (!llvm.ptr) -> ()
+ }) {operandSegmentSizes = array<i32: 0,0,0,1>, reduction_byref = array<i1: false>, reduction_syms=[@add_f32]} : (!llvm.ptr) -> ()
// CHECK: omp.sections nowait {
omp.sections nowait {
@@ -2098,14 +2098,14 @@ func.func @omp_task(%bool_var: i1, %i64_var: i64, %i32_var: i32, %data_var: memr
}
// Checking multiple clauses
- // CHECK: omp.task if(%[[bool_var]]) final(%[[bool_var]]) untied
- omp.task if(%bool_var) final(%bool_var) untied
+ // CHECK: omp.task allocate(%[[data_var]] : memref<i32> -> %[[data_var]] : memref<i32>)
+ omp.task allocate(%data_var : memref<i32> -> %data_var : memref<i32>)
+ // CHECK-SAME: final(%[[bool_var]]) if(%[[bool_var]])
+ final(%bool_var) if(%bool_var)
// CHECK-SAME: in_reduction(@add_f32 -> %[[redn_var1]] : !llvm.ptr, byref @add_f32 -> %[[redn_var2]] : !llvm.ptr)
in_reduction(@add_f32 -> %0 : !llvm.ptr, byref @add_f32 -> %1 : !llvm.ptr)
- // CHECK-SAME: priority(%[[i32_var]] : i32)
- priority(%i32_var : i32)
- // CHECK-SAME: allocate(%[[data_var]] : memref<i32> -> %[[data_var]] : memref<i32>)
- allocate(%data_var : memref<i32> -> %data_var : memref<i32>) {
+ // CHECK-SAME: priority(%[[i32_var]] : i32) untied
+ priority(%i32_var : i32) untied {
// CHECK: "test.foo"() : () -> ()
"test.foo"() : () -> ()
// CHECK: omp.terminator
@@ -2136,7 +2136,7 @@ func.func @omp_target_depend(%arg0: memref<i32>, %arg1: memref<i32>) {
omp.target depend(taskdependin -> %arg0 : memref<i32>, taskdependin -> %arg1 : memref<i32>, taskdependinout -> %arg0 : memref<i32>) {
// CHECK: omp.terminator
omp.terminator
- } {operandSegmentSizes = array<i32: 0,0,0,3,0>}
+ } {operandSegmentSizes = array<i32: 0,0,0,3,0,0,0,0>}
return
}
@@ -2281,7 +2281,7 @@ func.func @omp_taskgroup_multiple_tasks() -> () {
func.func @omp_taskgroup_clauses() -> () {
%testmemref = "test.memref"() : () -> (memref<i32>)
%testf32 = "test.f32"() : () -> (!llvm.ptr)
- // CHECK: omp.taskgroup task_reduction(@add_f32 -> %{{.+}}: !llvm.ptr) allocate(%{{.+}}: memref<i32> -> %{{.+}}: memref<i32>)
+ // CHECK: omp.taskgroup allocate(%{{.+}}: memref<i32> -> %{{.+}}: memref<i32>) task_reduction(@add_f32 -> %{{.+}}: !llvm.ptr)
omp.taskgroup allocate(%testmemref : memref<i32> -> %testmemref : memref<i32>) task_reduction(@add_f32 -> %testf32 : !llvm.ptr) {
// CHECK: omp.task
omp.task {
@@ -2421,8 +2421,8 @@ func.func @omp_taskloop(%lb: i32, %ub: i32, %step: i32) -> () {
}
%testi64 = "test.i64"() : () -> (i64)
- // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) {
- omp.taskloop grain_size(%testi64: i64) {
+ // CHECK: omp.taskloop grainsize(%{{[^:]+}}: i64) {
+ omp.taskloop grainsize(%testi64: i64) {
omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) {
// CHECK: omp.yield
omp.yield
@@ -2577,7 +2577,7 @@ func.func @omp_target_update_data (%if_cond : i1, %device : si32, %map1: memref<
%mapv_to = omp.map.info var_ptr(%map2 : memref<?xi32>, tensor<?xi32>) map_clauses(present, to) capture(ByRef) -> memref<?xi32> {name = ""}
- // CHECK: omp.target_update if(%[[VAL_0:.*]]) device(%[[VAL_1:.*]] : si32) nowait map_entries(%{{.*}}, %{{.*}} : memref<?xi32>, memref<?xi32>)
+ // CHECK: omp.target_update device(%[[VAL_1:.*]] : si32) if(%[[VAL_0:.*]]) map_entries(%{{.*}}, %{{.*}} : memref<?xi32>, memref<?xi32>) nowait
omp.target_update if(%if_cond) device(%device : si32) nowait map_entries(%mapv_from , %mapv_to : memref<?xi32>, memref<?xi32>)
return
}
@@ -2614,7 +2614,7 @@ func.func @omp_target_enter_update_exit_data_depend(%a: memref<?xi32>, %b: memre
}
// Then map that over to the target
- // CHECK: omp.target_enter_data depend(taskdependin -> [[ARG0]] : memref<?xi32>) nowait map_entries([[MAP0]], [[MAP2]] : memref<?xi32>, memref<?xi32>)
+ // CHECK: omp.target_enter_data depend(taskdependin -> [[ARG0]] : memref<?xi32>) map_entries([[MAP0]], [[MAP2]] : memref<?xi32>, memref<?xi32>) nowait
omp.target_enter_data depend(taskdependin -> %a: memref<?xi32>) nowait map_entries(%map_a, %map_c: memref<?xi32>, memref<?xi32>)
// Compute 'b' on the target and copy it back
@@ -2631,7 +2631,7 @@ func.func @omp_target_enter_update_exit_data_depend(%a: memref<?xi32>, %b: memre
}
// Copy the updated 'a' onto the target
- // CHECK: omp.target_update depend(taskdependin -> [[ARG0]] : memref<?xi32>) nowait map_entries([[MAP0]] : memref<?xi32>)
+ // CHECK: omp.target_update depend(taskdependin -> [[ARG0]] : memref<?xi32>) map_entries([[MAP0]] : memref<?xi32>) nowait
omp.target_update depend(taskdependin -> %a : memref<?xi32>) nowait map_entries(%map_a : memref<?xi32>)
// Compute 'c' on the target and copy it back
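The operandSegmentSizes widening throughout the OpenMP tests above accompanies ops that gained additional (often empty) operand groups, alongside the attribute renames (alignment_values → alignments, depends → depend_kinds, reductions → reduction_syms, and so on). The attribute carries one i32 entry per operand group, and the entries must sum to the op's actual operand count, so every new group adds an entry even when it is empty. A small sketch mirroring the allocate-only omp.parallel case above, not defining any new behavior:

// Sketch: six operand groups on omp.parallel, the first two (allocate and
// allocator vars) hold one operand each, and the segment entries sum to the
// two operands actually passed.
func.func @segment_sizes_sketch(%data_var : memref<i32>) {
  "omp.parallel" (%data_var, %data_var) ({
    omp.terminator
  }) {operandSegmentSizes = array<i32: 1, 1, 0, 0, 0, 0>} : (memref<i32>, memref<i32>) -> ()
  return
}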
diff --git a/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
index 8954839..43a4693 100644
--- a/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
+++ b/mlir/test/Dialect/SCF/wrap-while-loop-in-zero-trip-check.mlir
@@ -20,7 +20,7 @@ func.func @wrap_while_loop_in_zero_trip_check(%bound : i32) -> i32 {
// CHECK-SAME: %[[BOUND:.*]]: i32) -> i32 {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : i32
-// CHECK-DAG: %[[PRE_COND:.*]] = arith.cmpi slt, %[[C0]], %[[BOUND]] : i32
+// CHECK-DAG: %[[PRE_COND:.*]] = arith.cmpi sgt, %[[BOUND]], %[[C0]] : i32
// CHECK-DAG: %[[PRE_INV:.*]] = arith.addi %[[BOUND]], %[[C5]] : i32
// CHECK: %[[IF:.*]]:2 = scf.if %[[PRE_COND]] -> (i32, i32) {
// CHECK: %[[WHILE:.*]]:2 = scf.while (
diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir
index eb0dc01..737b736 100644
--- a/mlir/test/Dialect/SparseTensor/invalid.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid.mlir
@@ -1099,6 +1099,42 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse
return
}
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+#CSR = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : dense,
+ j : compressed
+ )
+}>
+
+func.func @sparse_extract_value(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#CSR, lvls = 1>) -> f32 {
+ // expected-error@+1 {{'sparse_tensor.extract_value' op mismatch in tensor encoding and iterator encoding.}}
+ %f = sparse_tensor.extract_value %sp at %it1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#CSR, lvls = 1>
+ return %f : f32
+}
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+func.func @sparse_extract_value(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) -> f32 {
+ // expected-error@+1 {{'sparse_tensor.extract_value' op must use last-level iterator to extract values.}}
+ %f = sparse_tensor.extract_value %sp at %it1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+ return %f : f32
+}
// -----
@@ -1155,3 +1191,78 @@ func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -
}
return %r1 : index
}
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+ %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+ %init = arith.constant 0 : index
+ // expected-error @+1 {{'sparse_tensor.coiterate' op contains duplicated cases.}}
+ %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord) iter_args(%arg = %init)
+ : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+ -> index
+ case %it1, _ {
+ sparse_tensor.yield %arg : index
+ }
+ case %it1, _ {
+ sparse_tensor.yield %arg : index
+ }
+ return %ret : index
+}
+
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+ %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+ %init = arith.constant 0 : index
+ // expected-error @+1 {{'sparse_tensor.coiterate' op types mismatch between 0th yield value and defined value on 0th region}}
+ %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord) iter_args(%arg = %init)
+ : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+ -> index
+ case %it1, _ {
+ %i = arith.constant 1 : i32
+ sparse_tensor.yield %i : i32
+ }
+ return %ret : index
+}
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+ %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+ %init = arith.constant 0 : index
+ // expected-error @+1 {{'sparse_tensor.coiterate' op required out-of-bound coordinates}}
+ %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord1, %coord2) iter_args(%arg = %init)
+ : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+ -> index
+ case %it1, _ {
+ %i = arith.constant 1 : i32
+ sparse_tensor.yield %i : i32
+ }
+ return %ret : index
+}
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
index bce0b41a..ab861a2 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
@@ -748,6 +748,27 @@ func.func @sparse_has_runtime() -> i1 {
)
}>
+// CHECK-LABEL: func.func @sparse_extract_value(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse>,
+// CHECK-SAME: %[[VAL_1:.*]]: !sparse_tensor.iterator<#sparse, lvls = 1>) -> f32 {
+// CHECK: %[[VAL_2:.*]] = sparse_tensor.extract_value %[[VAL_0]] at %[[VAL_1]] : tensor<4x8xf32, #sparse>, !sparse_tensor.iterator<#sparse, lvls = 1>
+// CHECK: return %[[VAL_2]] : f32
+// CHECK: }
+func.func @sparse_extract_value(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 1>) -> f32 {
+ %f = sparse_tensor.extract_value %sp at %it1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 1>
+ return %f : f32
+}
+
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
// CHECK-LABEL: func.func @sparse_extract_iter_space(
// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>,
// CHECK-SAME: %[[VAL_1:.*]]: !sparse_tensor.iterator<#sparse{{[0-9]*}}, lvls = 0>)
@@ -780,7 +801,7 @@ func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse
// CHECK-SAME: %[[VAL_1:.*]]: index,
// CHECK-SAME: %[[VAL_2:.*]]: index) -> index {
// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 : tensor<4x8xf32, #sparse{{[0-9]*}}>
-// CHECK: %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]]) iter_args(%[[VAL_7:.*]] = %[[VAL_1]]) : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> -> (index) {
+// CHECK: %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]]) iter_args(%[[VAL_7:.*]] = %[[VAL_1]]) : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> -> index {
// CHECK: sparse_tensor.yield %[[VAL_7]] : index
// CHECK: }
// CHECK: return %[[VAL_4]] : index
@@ -792,3 +813,36 @@ func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -
}
return %r1 : index
}
+
+
+// -----
+
+#COO = #sparse_tensor.encoding<{
+ map = (i, j) -> (
+ i : compressed(nonunique),
+ j : singleton(soa)
+ )
+}>
+
+
+// CHECK-LABEL: func.func @sparse_coiteration(
+// CHECK-SAME: %[[SP1:.*]]: !sparse_tensor.iter_space<#sparse, lvls = 0>,
+// CHECK-SAME: %[[SP2:.*]]: !sparse_tensor.iter_space<#sparse, lvls = 1>) -> index {
+// CHECK: %[[INIT:.*]] = arith.constant 0 : index
+// CHECK: %[[RET:.*]] = sparse_tensor.coiterate (%[[SP1]], %[[SP2]]) at(%[[COORD:.*]]) iter_args(%[[ARG:.*]] = %[[INIT]])
+// CHECK: case %[[VAL_6:.*]], _ {
+// CHECK: sparse_tensor.yield %[[ARG]] : index
+// CHECK: }
+// CHECK: return %[[RET]] : index
+// CHECK: }
+func.func @sparse_coiteration(%sp1 : !sparse_tensor.iter_space<#COO, lvls = 0>,
+ %sp2 : !sparse_tensor.iter_space<#COO, lvls = 1>) -> index {
+ %init = arith.constant 0 : index
+ %ret = sparse_tensor.coiterate (%sp1, %sp2) at (%coord) iter_args(%arg = %init)
+ : (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>)
+ -> index
+ case %it1, _ {
+ sparse_tensor.yield %arg : index
+ }
+ return %ret : index
+}
diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
index f5bbea0..268b394 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparse-reinterpret-map -sparsification="sparse-emit-strategy=sparse-iterator" --sparse-space-collapse --lower-sparse-iteration-to-scf | FileCheck %s
+// RUN: mlir-opt %s --sparse-reinterpret-map -sparsification="sparse-emit-strategy=sparse-iterator" --cse --sparse-space-collapse --lower-sparse-iteration-to-scf --loop-invariant-code-motion | FileCheck %s
#COO = #sparse_tensor.encoding<{
@@ -7,8 +7,7 @@
d1 : singleton(nonunique, soa),
d2 : singleton(nonunique, soa),
d3 : singleton(soa)
- ),
- explicitVal = 1 : i32
+ )
}>
// CHECK-LABEL: func.func @sqsum(
@@ -17,7 +16,10 @@
// CHECK-DAG: %[[POS_BUF:.*]] = sparse_tensor.positions %{{.*}} {level = 0 : index} : tensor<?x?x?x?xi32, #sparse> to memref<?xindex>
// CHECK: %[[POS_LO:.*]] = memref.load %[[POS_BUF]]{{\[}}%[[C0]]] : memref<?xindex>
// CHECK: %[[POS_HI:.*]] = memref.load %[[POS_BUF]]{{\[}}%[[C1]]] : memref<?xindex>
+// CHECK: %[[VAL_BUF:.*]] = sparse_tensor.values %{{.*}} : tensor<?x?x?x?xi32, #sparse> to memref<?xi32>
// CHECK: %[[SQ_SUM:.*]] = scf.for %[[POS:.*]] = %[[POS_LO]] to %[[POS_HI]] step %[[C1]] {{.*}} {
+// CHECK: %[[VAL:.*]] = memref.load %[[VAL_BUF]]{{\[}}%[[POS]]] : memref<?xi32>
+// CHECK: %[[MUL:.*]] = arith.muli %[[VAL]], %[[VAL]] : i32
// CHECK: %[[SUM:.*]] = arith.addi
// CHECK: scf.yield %[[SUM]] : i32
// CHECK: }
diff --git a/mlir/test/IR/properties.mlir b/mlir/test/IR/properties.mlir
index 01ea856..418b81d 100644
--- a/mlir/test/IR/properties.mlir
+++ b/mlir/test/IR/properties.mlir
@@ -2,10 +2,10 @@
// # RUN: mlir-opt %s -mlir-print-op-generic -split-input-file | mlir-opt -mlir-print-op-generic | FileCheck %s --check-prefix=GENERIC
// CHECK: test.with_properties
-// CHECK-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>{{$}}
+// CHECK-SAME: a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]{{$}}
// GENERIC: "test.with_properties"()
-// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}> : () -> ()
-test.with_properties <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+// GENERIC-SAME: <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo", c = "bar", flag = true}> : () -> ()
+test.with_properties a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
// CHECK: test.with_nice_properties
// CHECK-SAME: "foo bar" is -3{{$}}
@@ -34,18 +34,48 @@ test.using_property_in_custom [1, 4, 20]
// GENERIC-SAME: }>
test.using_property_ref_in_custom 1 + 4 = 5
-// CHECK: test.with_default_valued_properties {{$}}
+// CHECK: test.with_default_valued_properties na{{$}}
// GENERIC: "test.with_default_valued_properties"()
-// GENERIC-SAME: <{a = 0 : i32}>
-test.with_default_valued_properties <{a = 0 : i32}>
+// GENERIC-SAME: <{a = 0 : i32, b = "", c = -1 : i32, unit = false}> : () -> ()
+test.with_default_valued_properties 0 "" -1 unit_absent
+
+// CHECK: test.with_default_valued_properties 1 "foo" 0 unit{{$}}
+// GENERIC: "test.with_default_valued_properties"()
+// GENERIC-SAME: <{a = 1 : i32, b = "foo", c = 0 : i32, unit}> : () -> ()
+test.with_default_valued_properties 1 "foo" 0 unit
// CHECK: test.with_optional_properties
-// CHECK-SAME: <{b = 0 : i32}>
+// CHECK-SAME: simple = 0
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME: <{hasDefault = [], hasUnit = false, longSyntax = [], maybeUnit = [], nested = [], nonTrivialStorage = [], simple = [0]}> : () -> ()
+test.with_optional_properties simple = 0
+
+// CHECK: test.with_optional_properties{{$}}
// GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME: <{b = 0 : i32}>
-test.with_optional_properties <{b = 0 : i32}>
+// GENERIC-SAME: simple = []
+test.with_optional_properties
-// CHECK: test.with_optional_properties {{$}}
+// CHECK: test.with_optional_properties
+// CHECK-SAME: anAttr = 0 simple = 1 nonTrivialStorage = "foo" hasDefault = some<0> nested = some<1> longSyntax = some<"bar"> hasUnit maybeUnit = some<unit>
// GENERIC: "test.with_optional_properties"()
-// GENERIC-SAME: : () -> ()
+// GENERIC-SAME: <{anAttr = 0 : i32, hasDefault = [0], hasUnit, longSyntax = ["bar"], maybeUnit = [unit], nested = {{\[}}[1]], nonTrivialStorage = ["foo"], simple = [1]}> : () -> ()
test.with_optional_properties
+ anAttr = 0
+ simple = 1
+ nonTrivialStorage = "foo"
+ hasDefault = some<0>
+ nested = some<1>
+ longSyntax = some<"bar">
+ hasUnit
+ maybeUnit = some<unit>
+
+// CHECK: test.with_optional_properties
+// CHECK-SAME: nested = some<none>
+// GENERIC: "test.with_optional_properties"()
+// GENERIC-SAME: nested = {{\[}}[]]
+test.with_optional_properties nested = some<none>
+
+// CHECK: test.with_array_properties
+// CHECK-SAME: ints = [1, 2] strings = ["a", "b"] nested = {{\[}}[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent]
+// GENERIC: "test.with_array_properties"()
+test.with_array_properties ints = [1, 2] strings = ["a", "b"] nested = [[1, 2], [3, 4]] opt = [-1, -2] explicitOptions = [none, 0] explicitUnits = [unit, unit_absent] [] thats_has_default
diff --git a/mlir/test/IR/traits.mlir b/mlir/test/IR/traits.mlir
index 1e04670..49cfd7e 100644
--- a/mlir/test/IR/traits.mlir
+++ b/mlir/test/IR/traits.mlir
@@ -502,6 +502,25 @@ func.func @succeededOilistTrivial() {
// -----
+// CHECK-LABEL: @succeededOilistTrivialProperties
+func.func @succeededOilistTrivialProperties() {
+ // CHECK: test.oilist_with_keywords_only_properties keyword
+ test.oilist_with_keywords_only_properties keyword
+ // CHECK: test.oilist_with_keywords_only_properties otherKeyword
+ test.oilist_with_keywords_only_properties otherKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+ test.oilist_with_keywords_only_properties keyword otherKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword otherKeyword
+ test.oilist_with_keywords_only_properties otherKeyword keyword
+ // CHECK: test.oilist_with_keywords_only_properties thirdKeyword
+ test.oilist_with_keywords_only_properties thirdKeyword
+ // CHECK: test.oilist_with_keywords_only_properties keyword thirdKeyword
+ test.oilist_with_keywords_only_properties keyword thirdKeyword
+ return
+}
+
+// -----
+
// CHECK-LABEL: @succeededOilistSimple
func.func @succeededOilistSimple(%arg0 : i32, %arg1 : i32, %arg2 : i32) {
// CHECK: test.oilist_with_simple_args keyword %{{.*}} : i32
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
index 11ab30a..d1aed59 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir
@@ -428,7 +428,7 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
+// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
// CHECK: func @matmul_sequence_fusion(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
diff --git a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
index 7d247ae..ccf8e37 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-pad-using-interface.mlir
@@ -31,8 +31,8 @@ module attributes {transform.with_named_sequence} {
// CHECK-DAG: %[[DIM_IN1:.+]] = tensor.dim %[[IN]], %[[C1]]
// CHECK-DAG: %[[DIM1:.+]] = affine.apply #[[MAP1]]()[%[[DIM_IN1]]]
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
// CHECK: %[[RESULT:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[DIM0]] step %[[C2]]
-// CHECK: %[[C3:.+]] = arith.constant 3 : index
// CHECK: scf.for {{.*}} = %[[C0]] to %[[DIM1]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
// CHECK: %[[SWAP_RESULT:.*]] = scf.if
// CHECK: tensor.generate
@@ -62,8 +62,8 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 8)>
-// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 7)>
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 + 8)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 + 7)>
// CHECK: func @dynamic_2d_pad_tensor_inner_tiling(
// CHECK-SAME: %[[IN:.*]]: tensor<?x?xf32>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
@@ -107,9 +107,9 @@ module attributes {transform.with_named_sequence} {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C15:.*]] = arith.constant 15 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK: %[[RESULT:.*]] = scf.for {{.*}} = %[[C0]] to %[[C15]] step %[[C2]]
-// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index
-// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK: scf.for {{.*}} = %[[C0]] to %[[C16]] step %[[C3]] iter_args(%[[INNER_OUT:.*]] =
// CHECK: %[[SWAP_RESULT:.*]] = scf.if
// CHECK: tensor.generate
diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
index 488a52e..8eb1311 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-using-interface.mlir
@@ -16,21 +16,21 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
// CHECK-LABEL: func.func @simple_matmul(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
// CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]]
// CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
+// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
// CHECK: %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[ARG2]])
-// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
// CHECK: %[[INNER:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
// CHECK-DAG: %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]]
@@ -68,23 +68,23 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
// CHECK-LABEL: func.func @simple_matmul_memref(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
// CHECK-DAG: %[[M:.+]] = memref.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[K:.+]] = memref.dim %[[ARG0]], %[[C1]]
// CHECK-DAG: %[[N:.+]] = memref.dim %[[ARG1]], %[[C1]]
+// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
+// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index
// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
-// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
-// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index
// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]]
// CHECK-DAG: %[[TS_M:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[M]]]
// CHECK-DAG: %[[TS_N:.+]] = affine.min #[[$MAP1]](%[[IV1]])[%[[N]]]
@@ -127,18 +127,18 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 128, 10)>
// CHECK-LABEL: func.func @multi_result(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>)
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
-// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK-DAG: %[[INIT0:.+]] = tensor.empty()
// CHECK-DAG: %[[INIT1:.+]] = tensor.empty()
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
+// CHECK-DAG: %[[C300:.+]] = arith.constant 300 : index
+// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
// CHECK: %[[OUTER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[C128]] step %[[C10]]
// CHECK-SAME: iter_args(%[[ARG1:[a-zA-Z0-9]+]] = %[[INIT0]], %[[ARG2:[a-zA-Z0-9]+]] = %[[INIT1]])
-// CHECK-DAG: %[[C300:.+]] = arith.constant 300 : index
-// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
// CHECK: %[[INNER:[a-zA-Z0-9]+]]:2 = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[C300]] step %[[C20]]
// CHECK-SAME: iter_args(%[[ARG3:[a-zA-Z0-9]+]] = %[[ARG1]], %[[ARG4:[a-zA-Z0-9]+]] = %[[ARG2]])
// CHECK-DAG: %[[TS_Y:.+]] = affine.min #[[$MAP0]](%[[IV0]])
@@ -180,9 +180,9 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
// CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)>
// CHECK-DAG: #[[$MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)>
// CHECK-LABEL: func.func @conv2D(
@@ -193,7 +193,6 @@ module attributes {transform.with_named_sequence} {
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
-// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
// CHECK-DAG: %[[N:.+]] = tensor.dim %[[INPUT]], %[[C0]]
// CHECK-DAG: %[[C:.+]] = tensor.dim %[[INPUT]], %[[C3]]
// CHECK-DAG: %[[P:.+]] = tensor.dim %[[FILTER]], %[[C0]]
@@ -201,12 +200,13 @@ module attributes {transform.with_named_sequence} {
// CHECK-DAG: %[[F:.+]] = tensor.dim %[[FILTER]], %[[C3]]
// CHECK-DAG: %[[R:.+]] = tensor.dim %[[INIT]], %[[C1]]
// CHECK-DAG: %[[S:.+]] = tensor.dim %[[INIT]], %[[C2]]
+// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
+// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index
// CHECK: scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[P]] step %[[C10]]
// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[INIT]])
-// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
// CHECK: scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[Q]] step %[[C20]]
// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
-// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index
// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[C]] step %[[C30]]
// CHECK-SAME: iter_args(%[[INIT2:.+]] = %[[INIT1]])
// CHECK-DAG: %[[TS_P:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[P]]]
@@ -287,25 +287,25 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
// CHECK-LABEL: func.func @interchange_matmul(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
// CHECK-DAG: %[[M:.+]] = tensor.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[K:.+]] = tensor.dim %[[ARG0]], %[[C1]]
// CHECK-DAG: %[[N:.+]] = tensor.dim %[[ARG1]], %[[C1]]
+// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
+// CHECK-DAG: %[[C20:.+]] = arith.constant 20 : index
+// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index
// CHECK: %[[OUTER:[a-zA-Z0-9]+]] = scf.for %[[IV0:[a-zA-Z0-9]+]] = %[[C0]] to %[[N]] step %[[C20]]
// CHECK-SAME: iter_args(%[[INIT0:.+]] = %[[ARG2]])
-// CHECK-DAG: %[[C30:.+]] = arith.constant 30 : index
// CHECK: %[[INNER1:[a-zA-Z0-9]+]] = scf.for %[[IV1:[a-zA-Z0-9]+]] = %[[C0]] to %[[K]] step %[[C30]]
// CHECK-SAME: iter_args(%[[INIT1:.+]] = %[[INIT0]])
-// CHECK-DAG: %[[C10:.+]] = arith.constant 10 : index
// CHECK: %[[INNER2:[a-zA-Z0-9]+]] = scf.for %[[IV2:[a-zA-Z0-9]+]] = %[[C0]] to %[[M]] step %[[C10]]
// CHECK-SAME: iter_args(%[[INIT2:.+]] = %[[INIT1]])
// CHECK-DAG: %[[TS_N:.+]] = affine.min #[[$MAP0]](%[[IV0]])[%[[N]]]
diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir
index c5aff74..53dd0c6 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-using-scfforall.mlir
@@ -17,8 +17,8 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
// CHECK: func.func @simple_matmul(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
@@ -65,8 +65,8 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
// CHECK-LABEL: func.func @simple_matmul_memref(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: memref<?x?xf32>
@@ -117,7 +117,7 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (10, -d0 + 128)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 128, 10)>
// CHECK-LABEL: func.func @multi_result(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x200x300xf32>)
// CHECK-DAG: %[[INIT0:.+]] = tensor.empty()
@@ -161,9 +161,9 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (30, -d0 + s0)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 30)>
// CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 2 - 2)>
// CHECK-DAG: #[[$MAP4:.+]] = affine_map<(d0)[s0] -> (d0 + s0 * 3 - 3)>
// CHECK-LABEL: func.func @conv2D(
@@ -264,8 +264,8 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
-// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (20, -d0 + s0)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (10, -d0 + s0)>
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
// CHECK-LABEL: func.func @interchange_matmul(
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 04c2e9f..acebb206 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -148,12 +148,12 @@ llvm.func @test_omp_parallel_num_threads_3() -> () {
// CHECK: define internal void @[[OMP_OUTLINED_FN_NUM_THREADS_3_1]]
// CHECK: call void @__kmpc_barrier
-// CHECK: define void @test_omp_parallel_if_1(i32 %[[IF_VAR_1:.*]])
+// CHECK: define void @test_omp_parallel_if_1(i32 %[[IF_EXPR_1:.*]])
llvm.func @test_omp_parallel_if_1(%arg0: i32) -> () {
%0 = llvm.mlir.constant(0 : index) : i32
%1 = llvm.icmp "slt" %arg0, %0 : i32
-// CHECK: %[[IF_COND_VAR_1:.*]] = icmp slt i32 %[[IF_VAR_1]], 0
+// CHECK: %[[IF_COND_VAR_1:.*]] = icmp slt i32 %[[IF_EXPR_1]], 0
// CHECK: %[[GTN_IF_1:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[SI_VAR_IF_1:.*]])
@@ -1330,14 +1330,14 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64,
// CHECK: [[TMP2:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR]], i64 0, i64 0
// CHECK: [[OMP_THREAD2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
// CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB3]], i32 [[OMP_THREAD2]], ptr [[TMP2]])
- omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%arg3 : i64) {doacross_num_loops = 1 : i64}
// CHECK: [[TMP3:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
// CHECK: store i64 [[ARG0]], ptr [[TMP3]], align 8
// CHECK: [[TMP4:%.*]] = getelementptr inbounds [1 x i64], ptr [[ADDR3]], i64 0, i64 0
// CHECK: [[OMP_THREAD4:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB5:[0-9]+]])
// CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB5]], i32 [[OMP_THREAD4]], ptr [[TMP4]])
- omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {num_loops_val = 1 : i64}
+ omp.ordered depend_type(dependsource) depend_vec(%arg3 : i64) {doacross_num_loops = 1 : i64}
omp.yield
}
@@ -1360,7 +1360,7 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64,
// CHECK: [[TMP10:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR7]], i64 0, i64 0
// CHECK: [[OMP_THREAD8:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB7]])
// CHECK: call void @__kmpc_doacross_wait(ptr @[[GLOB7]], i32 [[OMP_THREAD8]], ptr [[TMP10]])
- omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {num_loops_val = 2 : i64}
+ omp.ordered depend_type(dependsink) depend_vec(%arg3, %arg4, %arg5, %arg6 : i64, i64, i64, i64) {doacross_num_loops = 2 : i64}
// CHECK: [[TMP11:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
// CHECK: store i64 [[ARG0]], ptr [[TMP11]], align 8
@@ -1369,7 +1369,7 @@ llvm.func @omp_ordered(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i64,
// CHECK: [[TMP13:%.*]] = getelementptr inbounds [2 x i64], ptr [[ADDR9]], i64 0, i64 0
// CHECK: [[OMP_THREAD10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB9:[0-9]+]])
// CHECK: call void @__kmpc_doacross_post(ptr @[[GLOB9]], i32 [[OMP_THREAD10]], ptr [[TMP13]])
- omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {num_loops_val = 2 : i64}
+ omp.ordered depend_type(dependsource) depend_vec(%arg3, %arg4 : i64, i64) {doacross_num_loops = 2 : i64}
omp.yield
}
diff --git a/mlir/test/Transforms/test-legalize-type-conversion.mlir b/mlir/test/Transforms/test-legalize-type-conversion.mlir
index 8254be6..07dfb49 100644
--- a/mlir/test/Transforms/test-legalize-type-conversion.mlir
+++ b/mlir/test/Transforms/test-legalize-type-conversion.mlir
@@ -103,8 +103,9 @@ func.func @test_block_argument_not_converted() {
// Make sure argument type changes aren't implicitly forwarded.
func.func @test_signature_conversion_no_converter() {
"test.signature_conversion_no_converter"() ({
- // expected-error@below {{failed to legalize unresolved materialization from ('f64') to 'f32' that remained live after conversion}}
+ // expected-error@below {{failed to materialize conversion for block argument #0 that remained live after conversion, type was 'f32'}}
^bb0(%arg0: f32):
+ // expected-note@below{{see existing live user here}}
"test.type_consumer"(%arg0) : (f32) -> ()
"test.return"(%arg0) : (f32) -> ()
}) : () -> ()
@@ -127,3 +128,18 @@ llvm.func @unsupported_func_op_interface() {
// CHECK: llvm.return
llvm.return
}
+
+// -----
+
+// CHECK-LABEL: func @test_signature_conversion_no_converter()
+func.func @test_signature_conversion_no_converter() {
+ // CHECK: "test.signature_conversion_no_converter"() ({
+ // CHECK: ^{{.*}}(%[[arg0:.*]]: f64):
+ "test.signature_conversion_no_converter"() ({
+ ^bb0(%arg0: f32):
+ // CHECK: "test.legal_op_d"(%[[arg0]]) : (f64) -> ()
+ "test.replace_with_legal_op"(%arg0) : (f32) -> ()
+ "test.return"() : () -> ()
+ }) : () -> ()
+ return
+}
diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir
index 7a7af2b..a789ab9 100644
--- a/mlir/test/Transforms/test-legalizer.mlir
+++ b/mlir/test/Transforms/test-legalizer.mlir
@@ -408,10 +408,10 @@ func.func @test_move_op_before_rollback() {
// CHECK-LABEL: func @test_properties_rollback()
func.func @test_properties_rollback() {
- // CHECK: test.with_properties <{a = 32 : i64,
+ // CHECK: test.with_properties a = 32,
// expected-remark @below{{op 'test.with_properties' is not legalizable}}
test.with_properties
- <{a = 32 : i64, array = array<i64: 1, 2, 3, 4>, b = "foo"}>
+ a = 32, b = "foo", c = "bar", flag = true, array = [1, 2, 3, 4]
{modify_inplace}
"test.return"() : () -> ()
}
diff --git a/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
index 10206dd..7e51d67 100644
--- a/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
+++ b/mlir/test/lib/Dialect/SCF/TestSCFWrapInZeroTripCheck.cpp
@@ -1,4 +1,4 @@
-//===- TestWrapInZeroTripCheck.cpp -- Passes to test SCF zero-trip-check --===//
+//===- TestSCFWrapInZeroTripCheck.cpp -- Pass to test SCF zero-trip-check -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -13,9 +13,11 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
using namespace mlir;
@@ -46,13 +48,19 @@ struct TestWrapWhileLoopInZeroTripCheckPass
func::FuncOp func = getOperation();
MLIRContext *context = &getContext();
IRRewriter rewriter(context);
- func.walk([&](scf::WhileOp op) {
- FailureOr<scf::WhileOp> result =
- scf::wrapWhileLoopInZeroTripCheck(op, rewriter, forceCreateCheck);
- // Ignore not implemented failure in tests. The expected output should
- // catch problems (e.g. transformation doesn't happen).
- (void)result;
- });
+ if (forceCreateCheck) {
+ func.walk([&](scf::WhileOp op) {
+ FailureOr<scf::WhileOp> result =
+ scf::wrapWhileLoopInZeroTripCheck(op, rewriter, forceCreateCheck);
+ // Ignore not implemented failure in tests. The expected output should
+ // catch problems (e.g. transformation doesn't happen).
+ (void)result;
+ });
+ } else {
+ RewritePatternSet patterns(context);
+ scf::populateSCFRotateWhileLoopPatterns(patterns);
+ (void)applyPatternsAndFoldGreedily(func, std::move(patterns));
+ }
}
Option<bool> forceCreateCheck{
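
For context, the non-forced path above is the standard greedy-driver idiom; the following is a minimal standalone sketch (the free function and its name are illustrative and not part of this patch, which keeps the logic inline in runOnOperation):

    #include "mlir/Dialect/Func/IR/FuncOps.h"
    #include "mlir/Dialect/SCF/Transforms/Patterns.h"
    #include "mlir/IR/PatternMatch.h"
    #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

    // Rotate every scf.while in `func` into do-while form via the upstream
    // rewrite patterns; the driver's result is deliberately ignored, matching
    // the test pass above.
    static void rotateWhileLoops(mlir::func::FuncOp func) {
      mlir::RewritePatternSet patterns(func.getContext());
      mlir::scf::populateSCFRotateWhileLoopPatterns(patterns);
      (void)mlir::applyPatternsAndFoldGreedily(func, std::move(patterns));
    }
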
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
index 6e75dd3..9ed1b3a 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.cpp
@@ -297,11 +297,17 @@ void test::printSwitchCases(OpAsmPrinter &p, Operation *op,
// CustomUsingPropertyInCustom
//===----------------------------------------------------------------------===//
-bool test::parseUsingPropertyInCustom(OpAsmParser &parser, int64_t value[3]) {
- return parser.parseLSquare() || parser.parseInteger(value[0]) ||
- parser.parseComma() || parser.parseInteger(value[1]) ||
- parser.parseComma() || parser.parseInteger(value[2]) ||
- parser.parseRSquare();
+bool test::parseUsingPropertyInCustom(OpAsmParser &parser,
+ SmallVector<int64_t> &value) {
+ auto elemParser = [&]() {
+ int64_t v = 0;
+ if (failed(parser.parseInteger(v)))
+ return failure();
+ value.push_back(v);
+ return success();
+ };
+ return failed(parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Square,
+ elemParser));
}
void test::printUsingPropertyInCustom(OpAsmPrinter &printer, Operation *op,
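
For reference, the parse/print pair for a variable-length list property like this follows the usual custom-directive shape; below is a minimal sketch (parseIntList/printIntList are illustrative stand-ins, not the exact bodies in TestFormatUtils.cpp):

    #include "mlir/IR/OpImplementation.h"
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"

    // Parse a square-bracketed, comma-separated list of integers into `value`,
    // one element per invocation of the lambda, mirroring the change above.
    static bool parseIntList(mlir::OpAsmParser &parser,
                             llvm::SmallVector<int64_t> &value) {
      auto elemParser = [&]() -> mlir::ParseResult {
        int64_t v = 0;
        if (mlir::failed(parser.parseInteger(v)))
          return mlir::failure();
        value.push_back(v);
        return mlir::success();
      };
      return mlir::failed(parser.parseCommaSeparatedList(
          mlir::OpAsmParser::Delimiter::Square, elemParser));
    }

    // Print the list back out as `[a, b, c]`, the inverse of the parser above.
    static void printIntList(mlir::OpAsmPrinter &printer,
                             llvm::ArrayRef<int64_t> value) {
      printer << '[';
      llvm::interleaveComma(value, printer);
      printer << ']';
    }
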
diff --git a/mlir/test/lib/Dialect/Test/TestFormatUtils.h b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
index 7e9cd83..6d4df7d 100644
--- a/mlir/test/lib/Dialect/Test/TestFormatUtils.h
+++ b/mlir/test/lib/Dialect/Test/TestFormatUtils.h
@@ -160,7 +160,8 @@ void printSwitchCases(mlir::OpAsmPrinter &p, mlir::Operation *op,
// CustomUsingPropertyInCustom
//===----------------------------------------------------------------------===//
-bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser, int64_t value[3]);
+bool parseUsingPropertyInCustom(mlir::OpAsmParser &parser,
+ llvm::SmallVector<int64_t> &value);
void printUsingPropertyInCustom(mlir::OpAsmPrinter &printer,
mlir::Operation *op,
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 9450764..2b55bff 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1884,6 +1884,7 @@ def LegalOpA : TEST_Op<"legal_op_a">,
def LegalOpB : TEST_Op<"legal_op_b">, Results<(outs I32)>;
def LegalOpC : TEST_Op<"legal_op_c">,
Arguments<(ins I32)>, Results<(outs I32)>;
+def LegalOpD : TEST_Op<"legal_op_d">, Arguments<(ins AnyType)>;
// Check that the conversion infrastructure can properly undo the creation of
// operations where an operation was created before its parent, in this case,
@@ -2947,11 +2948,18 @@ def TestVersionedOpC : TEST_Op<"versionedC"> {
// Op with a properties struct defined inline.
def TestOpWithProperties : TEST_Op<"with_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
+ let assemblyFormat = [{
+ `a` `=` $a `,`
+ `b` `=` $b `,`
+ `c` `=` $c `,`
+ `flag` `=` $flag `,`
+ `array` `=` $array attr-dict}];
let arguments = (ins
- IntProperty<"int64_t">:$a,
+ I64Property:$a,
StrAttr:$b, // Attributes can directly be used here.
- ArrayProperty<"int64_t", 4>:$array // example of an array
+ StringProperty:$c,
+ BoolProperty:$flag,
+ IntArrayProperty<"int64_t">:$array // example of an array
);
}
@@ -2974,7 +2982,7 @@ def TestOpWithPropertiesAndInferredType
// Demonstrate how to wrap an existing C++ class named MyPropStruct.
def MyStructProperty : Property<"MyPropStruct"> {
- let convertToAttribute = "$_storage.asAttribute($_ctxt)";
+ let convertToAttribute = "return $_storage.asAttribute($_ctxt);";
let convertFromAttribute = "return MyPropStruct::setFromAttr($_storage, $_attr, $_diag);";
let hashProperty = "$_storage.hash();";
}
@@ -2988,14 +2996,14 @@ def TestOpWithWrappedProperties : TEST_Op<"with_wrapped_properties"> {
def TestOpUsingPropertyInCustom : TEST_Op<"using_property_in_custom"> {
let assemblyFormat = "custom<UsingPropertyInCustom>($prop) attr-dict";
- let arguments = (ins ArrayProperty<"int64_t", 3>:$prop);
+ let arguments = (ins IntArrayProperty<"int64_t">:$prop);
}
def TestOpUsingPropertyInCustomAndOther
: TEST_Op<"using_property_in_custom_and_other"> {
let assemblyFormat = "custom<UsingPropertyInCustom>($prop) prop-dict attr-dict";
let arguments = (ins
- ArrayProperty<"int64_t", 3>:$prop,
+ IntArrayProperty<"int64_t">:$prop,
IntProperty<"int64_t">:$other
);
}
@@ -3021,7 +3029,7 @@ def TestOpUsingIntPropertyWithWorseBytecode
def PropertiesWithCustomPrint : Property<"PropertiesWithCustomPrint"> {
let convertToAttribute = [{
- getPropertiesAsAttribute($_ctxt, $_storage)
+ return getPropertiesAsAttribute($_ctxt, $_storage);
}];
let convertFromAttribute = [{
return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3085,7 +3093,7 @@ def TestOpWithNiceProperties : TEST_Op<"with_nice_properties"> {
def VersionedProperties : Property<"VersionedProperties"> {
let convertToAttribute = [{
- getPropertiesAsAttribute($_ctxt, $_storage)
+ return getPropertiesAsAttribute($_ctxt, $_storage);
}];
let convertFromAttribute = [{
return setPropertiesFromAttribute($_storage, $_attr, $_diag);
@@ -3131,13 +3139,65 @@ def TestOpWithVersionedProperties : TEST_Op<"with_versioned_properties"> {
}
def TestOpWithDefaultValuedProperties : TEST_Op<"with_default_valued_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
- let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a);
+ let assemblyFormat = [{
+ ($a^) : (`na`)?
+ ($b^)?
+ ($c^)?
+ ($unit^)?
+ attr-dict
+ }];
+ let arguments = (ins DefaultValuedAttr<I32Attr, "0">:$a,
+ DefaultValuedProperty<StringProperty, "\"\"">:$b,
+ DefaultValuedProperty<IntProperty<"int32_t">, "-1">:$c,
+ UnitProperty:$unit);
}
def TestOpWithOptionalProperties : TEST_Op<"with_optional_properties"> {
- let assemblyFormat = "prop-dict attr-dict";
- let arguments = (ins OptionalAttr<I32Attr>:$a, OptionalAttr<I32Attr>:$b);
+ let assemblyFormat = [{
+ (`anAttr` `=` $anAttr^)?
+ (`simple` `=` $simple^)?
+ (`nonTrivialStorage` `=` $nonTrivialStorage^)?
+ (`hasDefault` `=` $hasDefault^)?
+ (`nested` `=` $nested^)?
+ (`longSyntax` `=` $longSyntax^)?
+ (`hasUnit` $hasUnit^)?
+ (`maybeUnit` `=` $maybeUnit^)?
+ attr-dict
+ }];
+ let arguments = (ins
+ OptionalAttr<I32Attr>:$anAttr,
+ OptionalProperty<I64Property>:$simple,
+ OptionalProperty<StringProperty>:$nonTrivialStorage,
+ // Confirm that properties with default values now default to nullopt and have
+ // the long syntax.
+ OptionalProperty<DefaultValuedProperty<I64Property, "0">>:$hasDefault,
+ OptionalProperty<OptionalProperty<I64Property>>:$nested,
+ OptionalProperty<StringProperty, 0>:$longSyntax,
+ UnitProperty:$hasUnit,
+ OptionalProperty<UnitProperty>:$maybeUnit);
+}
+
+def TestOpWithArrayProperties : TEST_Op<"with_array_properties"> {
+ let assemblyFormat = [{
+ `ints` `=` $ints
+ `strings` `=` $strings
+ `nested` `=` $nested
+ `opt` `=` $opt
+ `explicitOptions` `=` $explicitOptions
+ `explicitUnits` `=` $explicitUnits
+ ($hasDefault^ `thats_has_default`)?
+ attr-dict
+ }];
+ let arguments = (ins
+ ArrayProperty<I64Property>:$ints,
+ ArrayProperty<StringProperty>:$strings,
+ ArrayProperty<ArrayProperty<I32Property>>:$nested,
+ OptionalProperty<ArrayProperty<I32Property>>:$opt,
+ ArrayProperty<OptionalProperty<I64Property>>:$explicitOptions,
+ ArrayProperty<UnitProperty>:$explicitUnits,
+ DefaultValuedProperty<ArrayProperty<I64Property>,
+ "::llvm::ArrayRef<int64_t>{}", "::llvm::SmallVector<int64_t>{}">:$hasDefault
+ );
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
index 3129085..795b9da 100644
--- a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
+++ b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td
@@ -86,6 +86,17 @@ def OIListTrivial : TEST_Op<"oilist_with_keywords_only"> {
}];
}
+// Ops related to OIList primitive
+def OIListTrivialProperties : TEST_Op<"oilist_with_keywords_only_properties"> {
+ let arguments = (ins UnitProperty:$keyword, UnitProperty:$otherKeyword,
+ UnitProperty:$diffNameUnitPropertyKeyword);
+ let assemblyFormat = [{
+ oilist( `keyword` $keyword
+ | `otherKeyword` $otherKeyword
+ | `thirdKeyword` $diffNameUnitPropertyKeyword) attr-dict
+ }];
+}
+
def OIListSimple : TEST_Op<"oilist_with_simple_args", [AttrSizedOperandSegments]> {
let arguments = (ins Optional<AnyType>:$arg0,
Optional<AnyType>:$arg1,
@@ -392,6 +403,17 @@ def FormatOptionalUnitAttrNoElide
let assemblyFormat = "($is_optional^)? attr-dict";
}
+def FormatOptionalUnitProperty : TEST_Op<"format_optional_unit_property"> {
+ let arguments = (ins UnitProperty:$is_optional);
+ let assemblyFormat = "(`is_optional` $is_optional^)? attr-dict";
+}
+
+def FormatOptionalUnitPropertyNoElide
+ : TEST_Op<"format_optional_unit_property_no_elide"> {
+ let arguments = (ins UnitProperty:$is_optional);
+ let assemblyFormat = "($is_optional^)? attr-dict";
+}
+
def FormatOptionalEnumAttr : TEST_Op<"format_optional_enum_attr"> {
let arguments = (ins OptionalAttr<SomeI64Enum>:$attr);
let assemblyFormat = "($attr^)? attr-dict";
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index 0546523..91dfb2f 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -1580,6 +1580,17 @@ struct TestTypeConversionAnotherProducer
}
};
+struct TestReplaceWithLegalOp : public ConversionPattern {
+ TestReplaceWithLegalOp(MLIRContext *ctx)
+ : ConversionPattern("test.replace_with_legal_op", /*benefit=*/1, ctx) {}
+ LogicalResult
+ matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+ ConversionPatternRewriter &rewriter) const final {
+ rewriter.replaceOpWithNewOp<LegalOpD>(op, operands[0]);
+ return success();
+ }
+};
+
struct TestTypeConversionDriver
: public PassWrapper<TestTypeConversionDriver, OperationPass<>> {
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestTypeConversionDriver)
@@ -1671,6 +1682,7 @@ struct TestTypeConversionDriver
// Initialize the conversion target.
mlir::ConversionTarget target(getContext());
+ target.addLegalOp<LegalOpD>();
target.addDynamicallyLegalOp<TestTypeProducerOp>([](TestTypeProducerOp op) {
auto recursiveType = dyn_cast<test::TestRecursiveType>(op.getType());
return op.getType().isF64() || op.getType().isInteger(64) ||
@@ -1696,7 +1708,8 @@ struct TestTypeConversionDriver
TestSignatureConversionUndo,
TestTestSignatureConversionNoConverter>(converter,
&getContext());
- patterns.add<TestTypeConversionAnotherProducer>(&getContext());
+ patterns.add<TestTypeConversionAnotherProducer, TestReplaceWithLegalOp>(
+ &getContext());
mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns,
converter);
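
For readers unfamiliar with how the new pattern and the addLegalOp<LegalOpD>() line interact, this is a minimal sketch of the driver invocation (assumed to live alongside the pattern in TestPatterns.cpp so LegalOpD and TestReplaceWithLegalOp are in scope; the real test routes everything through TestTypeConversionDriver together with its type converter):

    #include "mlir/IR/PatternMatch.h"
    #include "mlir/Transforms/DialectConversion.h"

    // Rewrite every "test.replace_with_legal_op" into the explicitly-legal
    // "test.legal_op_d"; ops the target says nothing about are left untouched
    // by the partial conversion.
    static mlir::LogicalResult runReplaceWithLegalOp(mlir::Operation *root) {
      mlir::MLIRContext *ctx = root->getContext();

      mlir::ConversionTarget target(*ctx);
      target.addLegalOp<LegalOpD>();

      mlir::RewritePatternSet patterns(ctx);
      patterns.add<TestReplaceWithLegalOp>(ctx);

      return mlir::applyPartialConversion(root, target, std::move(patterns));
    }
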
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
index 8f206d9..a99441c 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
@@ -234,11 +234,7 @@ applyTileToAll(RewriterBase &rewriter, Operation *transformOp,
scf::SCFTilingOptions tilingOptions;
tilingOptions.setTileSizes(tileSizes).setInterchange(interchange);
if (mapping) {
- auto mappingAttrs =
- llvm::map_to_vector(mapping.value(), [](Attribute attr) {
- return cast<DeviceMappingAttrInterface>(attr);
- });
- tilingOptions.setMapping(mappingAttrs);
+ tilingOptions.setMapping(mapping.value().getValue());
}
tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
diff --git a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
index 370c5ba..b886119 100644
--- a/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/math-polynomial-approx.mlir
@@ -493,6 +493,10 @@ func.func @asin() {
%cst3 = arith.constant -0.25 : f32
call @asin_f32(%cst3) : (f32) -> ()
+ // CHECK: -1.1197
+ %cst4 = arith.constant -0.90 : f32
+ call @asin_f32(%cst4) : (f32) -> ()
+
// CHECK: 0.25268, 0.384397, 0.597406
%vec_x = arith.constant dense<[0.25, 0.375, 0.5625]> : vector<3xf32>
call @asin_3xf32(%vec_x) : (vector<3xf32>) -> ()
diff --git a/mlir/test/mlir-tblgen/op-format.mlir b/mlir/test/mlir-tblgen/op-format.mlir
index 46d27264..03288ae 100644
--- a/mlir/test/mlir-tblgen/op-format.mlir
+++ b/mlir/test/mlir-tblgen/op-format.mlir
@@ -195,6 +195,16 @@ test.format_optional_unit_attribute
// CHECK: test.format_optional_unit_attribute_no_elide unit
test.format_optional_unit_attribute_no_elide unit
+// CHECK: test.format_optional_unit_property is_optional
+test.format_optional_unit_property is_optional
+
+// CHECK: test.format_optional_unit_property
+// CHECK-NOT: is_optional
+test.format_optional_unit_property
+
+// CHECK: test.format_optional_unit_property_no_elide unit
+test.format_optional_unit_property_no_elide unit
+
// CHECK: test.format_optional_enum_attr case5
test.format_optional_enum_attr case5
diff --git a/mlir/test/mlir-tblgen/op-format.td b/mlir/test/mlir-tblgen/op-format.td
index 4a19ffb..8af4341 100644
--- a/mlir/test/mlir-tblgen/op-format.td
+++ b/mlir/test/mlir-tblgen/op-format.td
@@ -73,7 +73,7 @@ def OptionalGroupA : TestFormat_Op<[{
// CHECK-NEXT: result.addAttribute("a", parser.getBuilder().getUnitAttr())
// CHECK: parser.parseKeyword("bar")
// CHECK-LABEL: OptionalGroupB::print
-// CHECK: if (!getAAttr())
+// CHECK: if (!(getAAttr() && getAAttr() != ((false) ? ::mlir::OpBuilder((*this)->getContext()).getUnitAttr() : nullptr)))
// CHECK-NEXT: odsPrinter << ' ' << "foo"
// CHECK-NEXT: else
// CHECK-NEXT: odsPrinter << ' ' << "bar"
@@ -84,7 +84,7 @@ def OptionalGroupB : TestFormat_Op<[{
// Optional group anchored on a default-valued attribute:
// CHECK-LABEL: OptionalGroupC::parse
-// CHECK: if (getAAttr() && getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
+// CHECK: if (getAAttr() != ::mlir::OpBuilder((*this)->getContext()).getStringAttr("default")) {
// CHECK-NEXT: odsPrinter << ' ';
// CHECK-NEXT: odsPrinter.printAttributeWithoutType(getAAttr());
// CHECK-NEXT: }
diff --git a/mlir/test/mlir-tblgen/op-properties.td b/mlir/test/mlir-tblgen/op-properties.td
index 7b0ee6b..918583c 100644
--- a/mlir/test/mlir-tblgen/op-properties.td
+++ b/mlir/test/mlir-tblgen/op-properties.td
@@ -1,8 +1,10 @@
-// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s
+// RUN: mlir-tblgen -gen-op-decls -I %S/../../include %s | FileCheck %s --check-prefix=DECL
+// RUN: mlir-tblgen -gen-op-defs -I %S/../../include %s | FileCheck %s --check-prefix=DEFS
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"
+include "mlir/IR/Properties.td"
def Test_Dialect : Dialect {
let name = "test";
@@ -15,7 +17,115 @@ def OpWithAttr : NS_Op<"op_with_attr">{
let arguments = (ins AnyAttr:$attr, OptionalAttr<AnyAttr>:$optional);
}
-// CHECK: void setAttrAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().attr = attr
-// CHECK: void setOptionalAttr(::mlir::Attribute attr)
-// CHECK-NEXT: getProperties().optional = attr
+// Test required and optional properties
+// ---
+
+def DefaultI64Array : IntArrayProperty<"int64_t"> {
+ let defaultValue = "::llvm::ArrayRef<int64_t>{}";
+ let storageTypeValueOverride = "::llvm::SmallVector<int64_t>{}";
+}
+
+def OpWithProps : NS_Op<"op_with_props"> {
+ let arguments = (ins
+ BoolProperty:$flag,
+ StringProperty:$string,
+ ArrayProperty<StringProperty>:$strings,
+ DefaultValuedProperty<I32Property, "0">:$default_int,
+ OptionalProperty<I64Property>:$optional,
+ DefaultI64Array:$intArray
+ );
+}
+
+/// Check that optional arguments to builders only go at the end.
+def OpWithSomeOptionalProperties : NS_Op<"op_with_some_optional_props"> {
+ let arguments = (ins
+ OptionalProperty<I64Property>:$mustSpecify,
+ I64Property:$required,
+ OptionalProperty<StringProperty>:$canOmit,
+ DefaultValuedProperty<I64Property, "-1">:$canOmit2
+ );
+}
+
+/// Check that the ambiguous attribute protection correctly stops optional properties
+/// from getting default argument values in builders.
+def OpWithOptionalPropsAndAttrs :
+ NS_Op<"with_some_optional_props_and_atts"> {
+ let arguments = (ins
+ OptionalProperty<BoolProperty>:$mustSpecify,
+ OptionalAttr<BoolAttr>:$ambiguous,
+ OptionalAttr<I32Attr>:$canOmit,
+ OptionalProperty<I32Property>:$canOmitProp
+ );
+}
+
+// DECL: void setAttrAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().attr = attr
+// DECL: void setOptionalAttr(::mlir::Attribute attr)
+// DECL-NEXT: getProperties().optional = attr
+
+// -----
+
+// DECL-LABEL: class OpWithOptionalPropsAndAttrs :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<bool> mustSpecify,
+// DECL-SAME: /*optional*/::mlir::BoolAttr ambiguous,
+// DECL-SAME: /*optional*/::mlir::IntegerAttr canOmit,
+// DECL-SAME: /*optional*/std::optional<int32_t> canOmitProp = std::nullopt);
+
+// -----
+
+// COM: Ensure the struct is set up how we expect
+// DECL-LABEL: class OpWithPropsGenericAdaptorBase
+// DECL: using flagTy = bool;
+// DECL-NEXT: flagTy flag;
+// DECL-NEXT: bool getFlag()
+// DECL-NEXT: propStorage = this->flag
+// DECL-NEXT: return propStorage;
+// DECL: void setFlag(bool propValue)
+// DECL-NEXT: propStorage = this->flag;
+// DECL-NEXT: propStorage = propValue;
+// DECL: using stringTy = std::string;
+// DECL: llvm::StringRef getString()
+// DECL: auto &propStorage = this->string;
+// DECL-NEXT: return ::llvm::StringRef{propStorage};
+// DECL: using stringsTy = ::llvm::SmallVector<std::string>
+// DECL: ::llvm::ArrayRef<std::string> getStrings()
+// DECL: using default_intTy = int32_t;
+// DECL: default_intTy default_int = 0;
+// DECL: intArrayTy intArray = ::llvm::SmallVector<int64_t>{};
+// DECL: ::llvm::ArrayRef<int64_t> getIntArray()
+// DECL: return ::llvm::ArrayRef<int64_t>{propStorage}
+// DECL: void setIntArray(::llvm::ArrayRef<int64_t> propValue)
+// DECL: propStorage.assign
+// DECL-LABEL: class OpWithProps :
+// DECL: setString(::llvm::StringRef newString)
+// DECL-NEXT: getProperties().setString(newString)
+
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: bool flag,
+// DECL-SAME: ::llvm::StringRef string,
+// DECL-SAME: ::llvm::ArrayRef<std::string> strings,
+// DECL-SAME: /*optional*/int32_t default_int = 0,
+// DECL-SAME: /*optional*/std::optional<int64_t> optional = std::nullopt,
+// DECL-SAME: /*optional*/::llvm::ArrayRef<int64_t> intArray = ::llvm::ArrayRef<int64_t>{});
+
+// DEFS-LABEL: OpWithProps::computePropertiesHash
+// DEFS: hash_intArray
+// DEFS-NEXT: return ::llvm::hash_value(::llvm::ArrayRef<int64_t>{propStorage})
+// DEFS: ::llvm::hash_value(prop.optional)
+// DEFS: hash_intArray(prop.intArray)
+
+// -----
+
+// DECL-LABEL: class OpWithSomeOptionalProperties :
+// DECL: static void build(
+// DECL-SAME: ::mlir::OpBuilder &odsBuilder,
+// DECL-SAME: ::mlir::OperationState &odsState,
+// DECL-SAME: /*optional*/std::optional<int64_t> mustSpecify,
+// DECL-SAME: int64_t required,
+// DECL-SAME: /*optional*/std::optional<::llvm::StringRef> canOmit = std::nullopt,
+// DECL-SAME: /*optional*/int64_t canOmit2 = -1);
diff --git a/mlir/test/python/execution_engine.py b/mlir/test/python/execution_engine.py
index e8b4700..8125bf3 100644
--- a/mlir/test/python/execution_engine.py
+++ b/mlir/test/python/execution_engine.py
@@ -5,6 +5,7 @@ from mlir.ir import *
from mlir.passmanager import *
from mlir.execution_engine import *
from mlir.runtime import *
+from ml_dtypes import bfloat16
# Log everything to stderr and flush so that we have a unified stream to match
@@ -521,6 +522,45 @@ def testComplexUnrankedMemrefAdd():
run(testComplexUnrankedMemrefAdd)
+# Test bf16 memrefs
+# CHECK-LABEL: TEST: testBF16Memref
+def testBF16Memref():
+ with Context():
+ module = Module.parse(
+ """
+ module {
+ func.func @main(%arg0: memref<1xbf16>,
+ %arg1: memref<1xbf16>) attributes { llvm.emit_c_interface } {
+ %0 = arith.constant 0 : index
+ %1 = memref.load %arg0[%0] : memref<1xbf16>
+ memref.store %1, %arg1[%0] : memref<1xbf16>
+ return
+ }
+ } """
+ )
+
+ arg1 = np.array([0.5]).astype(bfloat16)
+ arg2 = np.array([0.0]).astype(bfloat16)
+
+ arg1_memref_ptr = ctypes.pointer(
+ ctypes.pointer(get_ranked_memref_descriptor(arg1))
+ )
+ arg2_memref_ptr = ctypes.pointer(
+ ctypes.pointer(get_ranked_memref_descriptor(arg2))
+ )
+
+ execution_engine = ExecutionEngine(lowerToLLVM(module))
+ execution_engine.invoke("main", arg1_memref_ptr, arg2_memref_ptr)
+
+ # test to-numpy utility
+ # CHECK: [0.5]
+ npout = ranked_memref_to_numpy(arg2_memref_ptr[0])
+ log(npout)
+
+
+run(testBF16Memref)
+
+
# Test addition of two 2d_memref
# CHECK-LABEL: TEST: testDynamicMemrefAdd2D
def testDynamicMemrefAdd2D():
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 0fc750c..a2ceefb 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -155,6 +155,36 @@ static const char *const valueRangeReturnCode = R"(
std::next({0}, valueRange.first + valueRange.second)};
)";
+/// Parse operand/result segment_size property.
+/// {0}: Number of elements in the segment array
+static const char *const parseTextualSegmentSizeFormat = R"(
+ size_t i = 0;
+ auto parseElem = [&]() -> ::mlir::ParseResult {
+ if (i >= {0})
+ return $_parser.emitError($_parser.getCurrentLocation(),
+ "expected `]` after {0} segment sizes");
+ if (failed($_parser.parseInteger($_storage[i])))
+ return ::mlir::failure();
+ i += 1;
+ return ::mlir::success();
+ };
+ if (failed($_parser.parseCommaSeparatedList(
+          ::mlir::AsmParser::Delimiter::Square, parseElem)))
+ return failure();
+ if (i < {0})
+ return $_parser.emitError($_parser.getCurrentLocation(),
+ "expected {0} segment sizes, found only ") << i;
+ return success();
+)";
+
+static const char *const printTextualSegmentSize = R"(
+ [&]() {
+ $_printer << '[';
+ ::llvm::interleaveComma($_storage, $_printer);
+ $_printer << ']';
+ }()
+)";
+
/// Read operand/result segment_size from bytecode.
static const char *const readBytecodeSegmentSizeNative = R"(
if ($_reader.getBytecodeVersion() >= /*kNativePropertiesODSSegmentSize=*/6)
@@ -422,8 +452,10 @@ private:
// Property
std::optional<NamedProperty> operandSegmentsSize;
std::string operandSegmentsSizeStorage;
+ std::string operandSegmentsSizeParser;
std::optional<NamedProperty> resultSegmentsSize;
std::string resultSegmentsSizeStorage;
+ std::string resultSegmentsSizeParser;
// Indices to store the position in the emission order of the operand/result
// segment sizes attribute if emitted as part of the properties for legacy
@@ -448,31 +480,40 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
{namedAttr.name, AttributeMetadata{namedAttr.name, !isOptional, attr}});
}
- auto makeProperty = [&](StringRef storageType) {
+ auto makeProperty = [&](StringRef storageType, StringRef parserCall) {
return Property(
+ /*summary=*/"",
+ /*description=*/"",
/*storageType=*/storageType,
/*interfaceType=*/"::llvm::ArrayRef<int32_t>",
/*convertFromStorageCall=*/"$_storage",
/*assignToStorageCall=*/
"::llvm::copy($_value, $_storage.begin())",
/*convertToAttributeCall=*/
- "::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage)",
+ "return ::mlir::DenseI32ArrayAttr::get($_ctxt, $_storage);",
/*convertFromAttributeCall=*/
"return convertFromAttribute($_storage, $_attr, $_diag);",
+ /*parserCall=*/parserCall,
+ /*optionalParserCall=*/"",
+ /*printerCall=*/printTextualSegmentSize,
/*readFromMlirBytecodeCall=*/readBytecodeSegmentSizeNative,
/*writeToMlirBytecodeCall=*/writeBytecodeSegmentSizeNative,
/*hashPropertyCall=*/
"::llvm::hash_combine_range(std::begin($_storage), "
"std::end($_storage));",
- /*StringRef defaultValue=*/"");
+ /*StringRef defaultValue=*/"",
+ /*storageTypeValueOverride=*/"");
};
// Include key attributes from several traits as implicitly registered.
if (op.getTrait("::mlir::OpTrait::AttrSizedOperandSegments")) {
if (op.getDialect().usePropertiesForAttributes()) {
operandSegmentsSizeStorage =
llvm::formatv("std::array<int32_t, {0}>", op.getNumOperands());
- operandSegmentsSize = {"operandSegmentSizes",
- makeProperty(operandSegmentsSizeStorage)};
+ operandSegmentsSizeParser =
+ llvm::formatv(parseTextualSegmentSizeFormat, op.getNumOperands());
+ operandSegmentsSize = {
+ "operandSegmentSizes",
+ makeProperty(operandSegmentsSizeStorage, operandSegmentsSizeParser)};
} else {
attrMetadata.insert(
{operandSegmentAttrName, AttributeMetadata{operandSegmentAttrName,
@@ -484,8 +525,11 @@ void OpOrAdaptorHelper::computeAttrMetadata() {
if (op.getDialect().usePropertiesForAttributes()) {
resultSegmentsSizeStorage =
llvm::formatv("std::array<int32_t, {0}>", op.getNumResults());
- resultSegmentsSize = {"resultSegmentSizes",
- makeProperty(resultSegmentsSizeStorage)};
+ resultSegmentsSizeParser =
+ llvm::formatv(parseTextualSegmentSizeFormat, op.getNumResults());
+ resultSegmentsSize = {
+ "resultSegmentSizes",
+ makeProperty(resultSegmentsSizeStorage, resultSegmentsSizeParser)};
} else {
attrMetadata.insert(
{resultSegmentAttrName,
@@ -572,6 +616,12 @@ private:
void
genPropertiesSupportForBytecode(ArrayRef<ConstArgument> attrOrProperties);
+ // Generates getters for the properties.
+ void genPropGetters();
+
+  // Generates setters for the properties.
+ void genPropSetters();
+
// Generates getters for the attributes.
void genAttrGetters();
@@ -1041,6 +1091,8 @@ OpEmitter::OpEmitter(const Operator &op,
genNamedRegionGetters();
genNamedSuccessorGetters();
genPropertiesSupport();
+ genPropGetters();
+ genPropSetters();
genAttrGetters();
genAttrSetters();
genOptionalAttrRemovers();
@@ -1198,6 +1250,16 @@ void OpEmitter::genAttrNameGetters() {
}
}
+// Emit the getter for a named property.
+// It is templated to be shared between the Op and the adaptor class.
+template <typename OpClassOrAdaptor>
+static void emitPropGetter(OpClassOrAdaptor &opClass, const Operator &op,
+ StringRef name, const Property &prop) {
+ auto *method = opClass.addInlineMethod(prop.getInterfaceType(), name);
+ ERROR_IF_PRUNED(method, name, op);
+ method->body() << formatv(" return getProperties().{0}();", name);
+}
+
// Emit the getter for an attribute with the return type specified.
// It is templated to be shared between the Op and the adaptor class.
template <typename OpClassOrAdaptor>
@@ -1313,7 +1375,7 @@ void OpEmitter::genPropertiesSupport() {
)decl";
const char *propFromAttrFmt = R"decl(
auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
- ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+ ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
{0}
};
{2};
@@ -1358,7 +1420,10 @@ void OpEmitter::genPropertiesSupport() {
.addSubst("_storage", propertyStorage)
.addSubst("_diag", propertyDiag)),
name, getAttr);
- if (prop.hasDefaultValue()) {
+ if (prop.hasStorageTypeValueOverride()) {
+ setPropMethod << formatv(attrGetDefaultFmt, name,
+ prop.getStorageTypeValueOverride());
+ } else if (prop.hasDefaultValue()) {
setPropMethod << formatv(attrGetDefaultFmt, name,
prop.getDefaultValue());
} else {
@@ -1409,8 +1474,10 @@ void OpEmitter::genPropertiesSupport() {
const char *propToAttrFmt = R"decl(
{
const auto &propStorage = prop.{0};
- attrs.push_back(odsBuilder.getNamedAttr("{0}",
- {1}));
+ auto attr = [&]() -> ::mlir::Attribute {{
+ {1}
+ }();
+ attrs.push_back(odsBuilder.getNamedAttr("{0}", attr));
}
)decl";
for (const auto &attrOrProp : attrOrProperties) {
@@ -1458,9 +1525,12 @@ void OpEmitter::genPropertiesSupport() {
StringRef name = namedProperty->name;
auto &prop = namedProperty->prop;
FmtContext fctx;
- hashMethod << formatv(propHashFmt, name,
- tgfmt(prop.getHashPropertyCall(),
- &fctx.addSubst("_storage", propertyStorage)));
+ if (!prop.getHashPropertyCall().empty()) {
+ hashMethod << formatv(
+ propHashFmt, name,
+ tgfmt(prop.getHashPropertyCall(),
+ &fctx.addSubst("_storage", propertyStorage)));
+ }
}
}
hashMethod << " return llvm::hash_combine(";
@@ -1468,8 +1538,13 @@ void OpEmitter::genPropertiesSupport() {
attrOrProperties, hashMethod, [&](const ConstArgument &attrOrProp) {
if (const auto *namedProperty =
llvm::dyn_cast_if_present<const NamedProperty *>(attrOrProp)) {
- hashMethod << "\n hash_" << namedProperty->name << "(prop."
- << namedProperty->name << ")";
+ if (!namedProperty->prop.getHashPropertyCall().empty()) {
+ hashMethod << "\n hash_" << namedProperty->name << "(prop."
+ << namedProperty->name << ")";
+ } else {
+ hashMethod << "\n ::llvm::hash_value(prop."
+ << namedProperty->name << ")";
+ }
return;
}
const auto *namedAttr =
@@ -1524,8 +1599,9 @@ void OpEmitter::genPropertiesSupport() {
"\"{0}\") return ",
resultSegmentAttrName);
}
- getInherentAttrMethod << tgfmt(prop.getConvertToAttributeCall(), &fctx)
- << ";\n";
+ getInherentAttrMethod << "[&]() -> ::mlir::Attribute { "
+ << tgfmt(prop.getConvertToAttributeCall(), &fctx)
+ << " }();\n";
if (name == operandSegmentAttrName) {
setInherentAttrMethod
@@ -1549,13 +1625,15 @@ void OpEmitter::genPropertiesSupport() {
)decl",
name);
if (name == operandSegmentAttrName) {
- populateInherentAttrsMethod
- << formatv(" attrs.append(\"{0}\", {1});\n", operandSegmentAttrName,
- tgfmt(prop.getConvertToAttributeCall(), &fctx));
+ populateInherentAttrsMethod << formatv(
+ " attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+ operandSegmentAttrName,
+ tgfmt(prop.getConvertToAttributeCall(), &fctx));
} else {
- populateInherentAttrsMethod
- << formatv(" attrs.append(\"{0}\", {1});\n", resultSegmentAttrName,
- tgfmt(prop.getConvertToAttributeCall(), &fctx));
+ populateInherentAttrsMethod << formatv(
+ " attrs.append(\"{0}\", [&]() -> ::mlir::Attribute { {1} }());\n",
+ resultSegmentAttrName,
+ tgfmt(prop.getConvertToAttributeCall(), &fctx));
}
}
getInherentAttrMethod << " return std::nullopt;\n";
@@ -1701,6 +1779,26 @@ void OpEmitter::genPropertiesSupportForBytecode(
readPropertiesMethod << " return ::mlir::success();";
}
+void OpEmitter::genPropGetters() {
+ for (const NamedProperty &prop : op.getProperties()) {
+ std::string name = op.getGetterName(prop.name);
+ emitPropGetter(opClass, op, name, prop.prop);
+ }
+}
+
+void OpEmitter::genPropSetters() {
+ for (const NamedProperty &prop : op.getProperties()) {
+ std::string name = op.getSetterName(prop.name);
+ std::string argName = "new" + convertToCamelFromSnakeCase(
+ prop.name, /*capitalizeFirst=*/true);
+ auto *method = opClass.addInlineMethod(
+ "void", name, MethodParameter(prop.prop.getInterfaceType(), argName));
+ if (!method)
+ return;
+ method->body() << formatv(" getProperties().{0}({1});", name, argName);
+ }
+}
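Continuing the hypothetical "value" property from above, genPropSetters would emit an inline setter of roughly this shape (a sketch; the argument name is derived from the property name):

  // Sketch of the generated setter; actual output depends on the op.
  void setValue(int64_t newValue) {
    getProperties().setValue(newValue);
  }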
+
void OpEmitter::genAttrGetters() {
FmtContext fctx;
fctx.withBuilder("::mlir::Builder((*this)->getContext())");
@@ -2957,6 +3055,12 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
}
// Add parameters for all arguments (operands and attributes).
+ // Track "attr-like" (property and attribute) optional values separately from
+ // attributes themselves, so that the disambiguation code can look specifically
+ // at the first attribute when determining where to trim the optional-value
+ // list to avoid ambiguity, while still allowing all-property ops to use
+ // default parameters.
+ int defaultValuedAttrLikeStartIndex = op.getNumArgs();
int defaultValuedAttrStartIndex = op.getNumArgs();
// Successors and variadic regions go at the end of the parameter list, so no
// default arguments are possible.
@@ -2967,6 +3071,15 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
for (int i = op.getNumArgs() - 1; i >= 0; --i) {
auto *namedAttr =
llvm::dyn_cast_if_present<tblgen::NamedAttribute *>(op.getArg(i));
+ auto *namedProperty =
+ llvm::dyn_cast_if_present<tblgen::NamedProperty *>(op.getArg(i));
+ if (namedProperty) {
+ Property prop = namedProperty->prop;
+ if (!prop.hasDefaultValue())
+ break;
+ defaultValuedAttrLikeStartIndex = i;
+ continue;
+ }
if (!namedAttr)
break;
@@ -2986,6 +3099,7 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
if (retType == "::llvm::APInt" || retType == "::llvm::APFloat")
break;
+ defaultValuedAttrLikeStartIndex = i;
defaultValuedAttrStartIndex = i;
}
}
@@ -3001,8 +3115,10 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
if ((attrParamKind == AttrParamKind::WrappedAttr &&
canUseUnwrappedRawValue(attr)) ||
(attrParamKind == AttrParamKind::UnwrappedValue &&
- !canUseUnwrappedRawValue(attr)))
+ !canUseUnwrappedRawValue(attr))) {
++defaultValuedAttrStartIndex;
+ defaultValuedAttrLikeStartIndex = defaultValuedAttrStartIndex;
+ }
}
/// Collect any inferred attributes.
@@ -3029,8 +3145,16 @@ void OpEmitter::buildParamList(SmallVectorImpl<MethodParameter> &paramList,
operand->isOptional());
continue;
}
- if (llvm::isa_and_present<NamedProperty *>(arg)) {
- // TODO
+ if (auto *propArg = llvm::dyn_cast_if_present<NamedProperty *>(arg)) {
+ const Property &prop = propArg->prop;
+ StringRef type = prop.getInterfaceType();
+ std::string defaultValue;
+ if (prop.hasDefaultValue() && i >= defaultValuedAttrLikeStartIndex) {
+ defaultValue = prop.getDefaultValue();
+ }
+ bool isOptional = prop.hasDefaultValue();
+ paramList.emplace_back(type, propArg->name, StringRef(defaultValue),
+ isOptional);
continue;
}
const NamedAttribute &namedAttr = *arg.get<NamedAttribute *>();
@@ -3157,6 +3281,15 @@ void OpEmitter::genCodeForAddingArgAndRegionForBuilder(
}
}
+ // Push all properties to the result.
+ for (const auto &namedProp : op.getProperties()) {
+ // Use the setter from the Properties struct since the conversion from the
+ // interface type (used in the builder argument) to the storage type (used
+ // in the state) is not necessarily trivial.
+ std::string setterName = op.getSetterName(namedProp.name);
+ body << formatv(" {0}.getOrAddProperties<Properties>().{1}({2});\n",
+ builderOpState, setterName, namedProp.name);
+ }
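As a sketch, for the hypothetical "value" property this loop contributes a line of the following shape to the generated builder body (the state variable name corresponds to builderOpState):

  // Sketch of the emitted builder-body line for a property named "value".
  odsState.getOrAddProperties<Properties>().setValue(value);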
// Push all attributes to the result.
for (const auto &namedAttr : op.getAttributes()) {
auto &attr = namedAttr.attr;
@@ -3996,17 +4129,19 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
// Generate the data member using the storage type.
os << " using " << name << "Ty = " << prop.getStorageType() << ";\n"
<< " " << name << "Ty " << name;
- if (prop.hasDefaultValue())
+ if (prop.hasStorageTypeValueOverride())
+ os << " = " << prop.getStorageTypeValueOverride();
+ else if (prop.hasDefaultValue())
os << " = " << prop.getDefaultValue();
comparatorOs << " rhs." << name << " == this->" << name
<< " &&\n";
// Emit accessors using the interface type.
const char *accessorFmt = R"decl(;
- {0} get{1}() {
+ {0} get{1}() const {
auto &propStorage = this->{2};
return {3};
}
- void set{1}(const {0} &propValue) {
+ void set{1}({0} propValue) {
auto &propStorage = this->{2};
{4};
}
@@ -4274,6 +4409,11 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
ERROR_IF_PRUNED(m, "Adaptor::getAttributes", op);
m->body() << " return odsAttrs;";
}
+ for (auto &namedProp : op.getProperties()) {
+ std::string name = op.getGetterName(namedProp.name);
+ emitPropGetter(genericAdaptorBase, op, name, namedProp.prop);
+ }
+
for (auto &namedAttr : op.getAttributes()) {
const auto &name = namedAttr.name;
const auto &attr = namedAttr.attr;
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index a97d876..27ad79a 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -45,7 +45,7 @@ public:
OpVariableElement(const VarT *var) : var(var) {}
/// Get the variable.
- const VarT *getVar() { return var; }
+ const VarT *getVar() const { return var; }
protected:
/// The op variable, e.g. a type or attribute constraint.
@@ -64,11 +64,6 @@ struct AttributeVariable
return attrType ? attrType->getBuilderCall() : std::nullopt;
}
- /// Return if this attribute refers to a UnitAttr.
- bool isUnitAttr() const {
- return var->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
- }
-
/// Indicate if this attribute is printed "qualified" (that is it is
/// prefixed with the `#dialect.mnemonic`).
bool shouldBeQualified() { return shouldBeQualifiedFlag; }
@@ -98,6 +93,42 @@ using SuccessorVariable =
/// This class represents a variable that refers to a property argument.
using PropertyVariable =
OpVariableElement<NamedProperty, VariableElement::Property>;
+
+/// LLVM RTTI helper for attribute-like variables, that is, attributes or
+/// properties. This allows for common handling of attributes and properties in
+/// parts of the code that are oblivious to whether something is stored as an
+/// attribute or a property.
+struct AttributeLikeVariable : public VariableElement {
+ enum { AttributeLike = 1 << 0 };
+
+ static bool classof(const VariableElement *ve) {
+ return ve->getKind() == VariableElement::Attribute ||
+ ve->getKind() == VariableElement::Property;
+ }
+
+ static bool classof(const FormatElement *fe) {
+ return isa<VariableElement>(fe) && classof(cast<VariableElement>(fe));
+ }
+
+ /// Returns true if the variable is a UnitAttr or a UnitProperty.
+ bool isUnit() const {
+ if (const auto *attr = dyn_cast<AttributeVariable>(this))
+ return attr->getVar()->attr.getBaseAttr().getAttrDefName() == "UnitAttr";
+ if (const auto *prop = dyn_cast<PropertyVariable>(this)) {
+ return prop->getVar()->prop.getBaseProperty().getPropertyDefName() ==
+ "UnitProperty";
+ }
+ llvm_unreachable("Type that wasn't listed in classof()");
+ }
+
+ StringRef getName() const {
+ if (const auto *attr = dyn_cast<AttributeVariable>(this))
+ return attr->getVar()->name;
+ if (const auto *prop = dyn_cast<PropertyVariable>(this))
+ return prop->getVar()->name;
+ llvm_unreachable("Type that wasn't listed in classof()");
+ }
+};
} // namespace
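A hedged usage sketch of the new helper, assuming some FormatElement *anchor obtained from an optional group; the format generator below uses it in essentially this way:

  // Sketch: handle unit attributes and unit properties uniformly.
  if (auto *var = dyn_cast<AttributeLikeVariable>(anchor)) {
    if (var->isUnit())
      llvm::errs() << "unit-like anchor: " << var->getName() << "\n";
  }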
//===----------------------------------------------------------------------===//
@@ -214,11 +245,11 @@ public:
/// If the parsing element is a single UnitAttr element, then it returns the
/// attribute variable. Otherwise, returns nullptr.
- AttributeVariable *
- getUnitAttrParsingElement(ArrayRef<FormatElement *> pelement) {
+ AttributeLikeVariable *
+ getUnitVariableParsingElement(ArrayRef<FormatElement *> pelement) {
if (pelement.size() == 1) {
- auto *attrElem = dyn_cast<AttributeVariable>(pelement[0]);
- if (attrElem && attrElem->isUnitAttr())
+ auto *attrElem = dyn_cast<AttributeLikeVariable>(pelement[0]);
+ if (attrElem && attrElem->isUnit())
return attrElem;
}
return nullptr;
@@ -488,6 +519,36 @@ const char *const enumAttrParserCode = R"(
}
)";
+/// The code snippet used to generate a parser call for a property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+/// {3}: The description of the expected property for the error message.
+const char *const propertyParserCode = R"(
+ auto {0}PropLoc = parser.getCurrentLocation();
+ auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::ParseResult {{
+ {2}
+ return ::mlir::success();
+ }(result.getOrAddProperties<{1}::Properties>().{0});
+ if (failed({0}PropParseResult)) {{
+ return parser.emitError({0}PropLoc, "invalid value for property {0}, expected {3}");
+ }
+)";
+
+/// The code snippet used to generate a parser call for an optionally parsed
+/// property.
+/// {0}: The name of the property
+/// {1}: The C++ class name of the operation
+/// {2}: The property's parser code with appropriate substitutions performed
+const char *const optionalPropertyParserCode = R"(
+ auto {0}PropParseResult = [&](auto& propStorage) -> ::mlir::OptionalParseResult {{
+ {2}
+ return ::mlir::success();
+ }(result.getOrAddProperties<{1}::Properties>().{0});
+ if ({0}PropParseResult.has_value() && failed(*{0}PropParseResult)) {{
+ return ::mlir::failure();
+ }
+)";
+
/// The code snippet used to generate a parser call for an operand.
///
/// {0}: The name of the operand.
@@ -796,9 +857,9 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
// If the anchor is a unit attribute, it won't be parsed directly so elide
// it.
- auto *anchor = dyn_cast<AttributeVariable>(optional->getAnchor());
+ auto *anchor = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
FormatElement *elidedAnchorElement = nullptr;
- if (anchor && anchor != elements.front() && anchor->isUnitAttr())
+ if (anchor && anchor != elements.front() && anchor->isUnit())
elidedAnchorElement = anchor;
for (FormatElement *childElement : elements)
if (childElement != elidedAnchorElement)
@@ -808,7 +869,7 @@ static void genElementParserStorage(FormatElement *element, const Operator &op,
} else if (auto *oilist = dyn_cast<OIListElement>(element)) {
for (ArrayRef<FormatElement *> pelement : oilist->getParsingElements()) {
- if (!oilist->getUnitAttrParsingElement(pelement))
+ if (!oilist->getUnitVariableParsingElement(pelement))
for (FormatElement *element : pelement)
genElementParserStorage(element, op, body);
}
@@ -1049,7 +1110,6 @@ static void genCustomDirectiveParser(CustomDirective *dir, MethodBody &body,
body << llvm::formatv(" result.addAttribute(\"{0}\", {0}Attr);\n",
var->name);
}
-
} else if (auto *operand = dyn_cast<OperandVariable>(param)) {
const NamedTypeConstraint *var = operand->getVar();
if (var->isOptional()) {
@@ -1137,6 +1197,29 @@ static void genEnumAttrParser(const NamedAttribute *var, MethodBody &body,
validCaseKeywordsStr, errorMessage, attrAssignment);
}
+// Generate the parser for a property.
+static void genPropertyParser(PropertyVariable *propVar, MethodBody &body,
+ StringRef opCppClassName,
+ bool requireParse = true) {
+ StringRef name = propVar->getVar()->name;
+ const Property &prop = propVar->getVar()->prop;
+ bool parseOptionally =
+ prop.hasDefaultValue() && !requireParse && prop.hasOptionalParser();
+ FmtContext fmtContext;
+ fmtContext.addSubst("_parser", "parser");
+ fmtContext.addSubst("_ctxt", "parser.getContext()");
+ fmtContext.addSubst("_storage", "propStorage");
+
+ if (parseOptionally) {
+ body << formatv(optionalPropertyParserCode, name, opCppClassName,
+ tgfmt(prop.getOptionalParserCall(), &fmtContext));
+ } else {
+ body << formatv(propertyParserCode, name, opCppClassName,
+ tgfmt(prop.getParserCall(), &fmtContext),
+ prop.getSummary());
+ }
+}
+
// Generate the parser for an attribute.
static void genAttrParser(AttributeVariable *attr, MethodBody &body,
FmtContext &attrTypeCtx, bool parseAsOptional,
@@ -1213,14 +1296,16 @@ if (!dict) {
}
)decl";
- // TODO: properties might be optional as well.
+ // {0}: fromAttribute call
+ // {1}: property name
+ // {2}: isRequired
const char *propFromAttrFmt = R"decl(
auto setFromAttr = [] (auto &propStorage, ::mlir::Attribute propAttr,
- ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) {{
+ ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError) -> ::mlir::LogicalResult {{
{0};
};
auto attr = dict.get("{1}");
-if (!attr) {{
+if (!attr && {2}) {{
emitError() << "expected key entry for {1} in DictionaryAttr to set "
"Properties.";
return ::mlir::failure();
@@ -1238,13 +1323,14 @@ if (::mlir::failed(setFromAttr(prop.{1}, attr, emitError)))
StringRef name = namedProperty.name;
const Property &prop = namedProperty.prop;
+ bool isRequired = !prop.hasDefaultValue();
FmtContext fctx;
body << formatv(propFromAttrFmt,
tgfmt(prop.getConvertFromAttributeCall(),
&fctx.addSubst("_attr", "propAttr")
.addSubst("_storage", "propStorage")
.addSubst("_diag", "emitError")),
- name);
+ name, isRequired);
}
// Generate the setter for any attribute not parsed elsewhere.
@@ -1331,20 +1417,24 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
// If the anchor is a unit attribute, we don't need to print it. When
// parsing, we will add this attribute if this group is present.
FormatElement *elidedAnchorElement = nullptr;
- auto *anchorAttr = dyn_cast<AttributeVariable>(optional->getAnchor());
- if (anchorAttr && anchorAttr != firstElement &&
- anchorAttr->isUnitAttr()) {
- elidedAnchorElement = anchorAttr;
+ auto *anchorVar = dyn_cast<AttributeLikeVariable>(optional->getAnchor());
+ if (anchorVar && anchorVar != firstElement && anchorVar->isUnit()) {
+ elidedAnchorElement = anchorVar;
if (!thenGroup == optional->isInverted()) {
- // Add the anchor unit attribute to the operation state.
- if (useProperties) {
+ // Add the anchor unit attribute or property to the operation state
+ // or set the property to true.
+ if (isa<PropertyVariable>(anchorVar)) {
+ body << formatv(
+ " result.getOrAddProperties<{1}::Properties>().{0} = true;",
+ anchorVar->getName(), opCppClassName);
+ } else if (useProperties) {
body << formatv(
" result.getOrAddProperties<{1}::Properties>().{0} = "
"parser.getBuilder().getUnitAttr();",
- anchorAttr->getVar()->name, opCppClassName);
+ anchorVar->getName(), opCppClassName);
} else {
- body << " result.addAttribute(\"" << anchorAttr->getVar()->name
+ body << " result.addAttribute(\"" << anchorVar->getName()
<< "\", parser.getBuilder().getUnitAttr());\n";
}
}
@@ -1368,6 +1458,12 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
genAttrParser(attrVar, body, attrTypeCtx, /*parseAsOptional=*/true,
useProperties, opCppClassName);
body << " if (" << attrVar->getVar()->name << "Attr) {\n";
+ } else if (auto *propVar = dyn_cast<PropertyVariable>(firstElement)) {
+ genPropertyParser(propVar, body, opCppClassName, /*requireParse=*/false);
+ body << llvm::formatv("if ({0}PropParseResult.has_value() && "
+ "succeeded(*{0}PropParseResult)) ",
+ propVar->getVar()->name)
+ << " {\n";
} else if (auto *literal = dyn_cast<LiteralElement>(firstElement)) {
body << " if (::mlir::succeeded(parser.parseOptional";
genLiteralParser(literal->getSpelling(), body);
@@ -1430,15 +1526,19 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
body << ")) {\n";
StringRef lelementName = lelement->getSpelling();
body << formatv(oilistParserCode, lelementName);
- if (AttributeVariable *unitAttrElem =
- oilist->getUnitAttrParsingElement(pelement)) {
- if (useProperties) {
+ if (AttributeLikeVariable *unitVarElem =
+ oilist->getUnitVariableParsingElement(pelement)) {
+ if (isa<PropertyVariable>(unitVarElem)) {
+ body << formatv(
+ " result.getOrAddProperties<{1}::Properties>().{0} = true;",
+ unitVarElem->getName(), opCppClassName);
+ } else if (useProperties) {
body << formatv(
" result.getOrAddProperties<{1}::Properties>().{0} = "
"parser.getBuilder().getUnitAttr();",
- unitAttrElem->getVar()->name, opCppClassName);
+ unitVarElem->getName(), opCppClassName);
} else {
- body << " result.addAttribute(\"" << unitAttrElem->getVar()->name
+ body << " result.addAttribute(\"" << unitVarElem->getName()
<< "\", UnitAttr::get(parser.getContext()));\n";
}
} else {
@@ -1468,6 +1568,8 @@ void OperationFormat::genElementParser(FormatElement *element, MethodBody &body,
(genCtx == GenContext::Normal && attr->getVar()->attr.isOptional());
genAttrParser(attr, body, attrTypeCtx, parseAsOptional, useProperties,
opCppClassName);
+ } else if (auto *prop = dyn_cast<PropertyVariable>(element)) {
+ genPropertyParser(prop, body, opCppClassName);
} else if (auto *operand = dyn_cast<OperandVariable>(element)) {
ArgumentLengthKind lengthKind = getArgumentLengthKind(operand->getVar());
@@ -1876,6 +1978,38 @@ const char *enumAttrBeginPrinterCode = R"(
auto caseValueStr = {1}(caseValue);
)";
+/// Generate a check that an optional or default-valued attribute or property
+/// has a non-default value. For these purposes, the default value of an
+/// optional attribute is its absence, even if the attribute itself has a
+/// default value.
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+ AttributeVariable &attrElement) {
+ Attribute attr = attrElement.getVar()->attr;
+ std::string getter = op.getGetterName(attrElement.getVar()->name);
+ bool optionalAndDefault = attr.isOptional() && attr.hasDefaultValue();
+ if (optionalAndDefault)
+ body << "(";
+ if (attr.isOptional())
+ body << getter << "Attr()";
+ if (optionalAndDefault)
+ body << " && ";
+ if (attr.hasDefaultValue()) {
+ FmtContext fctx;
+ fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
+ body << getter << "Attr() != "
+ << tgfmt(attr.getConstBuilderTemplate(), &fctx,
+ attr.getDefaultValue());
+ }
+ if (optionalAndDefault)
+ body << ")";
+}
+
+static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
+ PropertyVariable &propElement) {
+ body << op.getGetterName(propElement.getVar()->name)
+ << "() != " << propElement.getVar()->prop.getDefaultValue();
+}
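For a hypothetical default-valued property "alignment" with default 0, the property overload above emits the check getAlignment() != 0; wrapped in the optional-group printer it ends up looking roughly like this sketch:

  // Sketch of the printer guard emitted for an optional group anchored on a
  // default-valued property "alignment" (default 0).
  if (getAlignment() != 0) {
    // ... print the optional group ...
  }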
+
/// Generate the printer for the 'prop-dict' directive.
static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
MethodBody &body) {
@@ -1904,6 +2038,15 @@ static void genPropDictPrinter(OperationFormat &fmt, Operator &op,
body << " }\n";
}
}
+ // Similarly, elide default-valued properties.
+ for (const NamedProperty &prop : op.getProperties()) {
+ if (prop.prop.hasDefaultValue()) {
+ body << " if (" << op.getGetterName(prop.name)
+ << "() == " << prop.prop.getDefaultValue() << ") {";
+ body << " elidedProps.push_back(\"" << prop.name << "\");\n";
+ body << " }\n";
+ }
+ }
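With the same hypothetical "alignment" property, the new elision loop contributes roughly the following to the generated prop-dict printer (sketch):

  // Sketch: default-valued properties are elided from the printed prop-dict.
  if (getAlignment() == 0) {
    elidedProps.push_back("alignment");
  }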
body << " _odsPrinter << \" \";\n"
<< " printProperties(this->getContext(), _odsPrinter, "
@@ -2031,7 +2174,6 @@ static void genCustomDirectiveParameterPrinter(FormatElement *element,
} else if (auto *property = dyn_cast<PropertyVariable>(element)) {
FmtContext ctx;
- ctx.addSubst("_ctxt", "getContext()");
const NamedProperty *namedProperty = property->getVar();
ctx.addSubst("_storage", "getProperties()." + namedProperty->name);
body << tgfmt(namedProperty->prop.getConvertFromStorageCall(), &ctx);
@@ -2154,16 +2296,6 @@ static void genEnumAttrPrinter(const NamedAttribute *var, const Operator &op,
" }\n";
}
-/// Generate a check that a DefaultValuedAttr has a value that is non-default.
-static void genNonDefaultValueCheck(MethodBody &body, const Operator &op,
- AttributeVariable &attrElement) {
- FmtContext fctx;
- Attribute attr = attrElement.getVar()->attr;
- fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())");
- body << " && " << op.getGetterName(attrElement.getVar()->name) << "Attr() != "
- << tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue());
-}
-
/// Generate the check for the anchor of an optional group.
static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
const Operator &op,
@@ -2190,17 +2322,12 @@ static void genOptionalGroupPrinterAnchor(FormatElement *anchor,
genOptionalGroupPrinterAnchor(element->getInputs(), op, body);
})
.Case([&](AttributeVariable *element) {
- Attribute attr = element->getVar()->attr;
- body << op.getGetterName(element->getVar()->name) << "Attr()";
- if (attr.isOptional())
- return; // done
- if (attr.hasDefaultValue()) {
- // Consider a default-valued attribute as present if it's not the
- // default value.
- genNonDefaultValueCheck(body, op, *element);
- return;
- }
- llvm_unreachable("attribute must be optional or default-valued");
+ // Consider a default-valued attribute as present if it has a non-default
+ // value, and an optional attribute as present if it is set.
+ genNonDefaultValueCheck(body, op, *element);
+ })
+ .Case([&](PropertyVariable *element) {
+ genNonDefaultValueCheck(body, op, *element);
})
.Case([&](CustomDirective *ele) {
body << '(';
@@ -2276,10 +2403,10 @@ void OperationFormat::genElementPrinter(FormatElement *element,
ArrayRef<FormatElement *> thenElements = optional->getThenElements();
ArrayRef<FormatElement *> elseElements = optional->getElseElements();
FormatElement *elidedAnchorElement = nullptr;
- auto *anchorAttr = dyn_cast<AttributeVariable>(anchor);
+ auto *anchorAttr = dyn_cast<AttributeLikeVariable>(anchor);
if (anchorAttr && anchorAttr != thenElements.front() &&
(elseElements.empty() || anchorAttr != elseElements.front()) &&
- anchorAttr->isUnitAttr()) {
+ anchorAttr->isUnit()) {
elidedAnchorElement = anchorAttr;
}
auto genElementPrinters = [&](ArrayRef<FormatElement *> elements) {
@@ -2319,13 +2446,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
for (VariableElement *var : vars) {
TypeSwitch<FormatElement *>(var)
.Case([&](AttributeVariable *attrEle) {
- body << " || (" << op.getGetterName(attrEle->getVar()->name)
- << "Attr()";
- Attribute attr = attrEle->getVar()->attr;
- if (attr.hasDefaultValue()) {
- // Don't print default-valued attributes.
- genNonDefaultValueCheck(body, op, *attrEle);
- }
+ body << " || (";
+ genNonDefaultValueCheck(body, op, *attrEle);
+ body << ")";
+ })
+ .Case([&](PropertyVariable *propEle) {
+ body << " || (";
+ genNonDefaultValueCheck(body, op, *propEle);
body << ")";
})
.Case([&](OperandVariable *ele) {
@@ -2352,7 +2479,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
body << ") {\n";
genLiteralPrinter(lelement->getSpelling(), body, shouldEmitSpace,
lastWasPunctuation);
- if (oilist->getUnitAttrParsingElement(pelement) == nullptr) {
+ if (oilist->getUnitVariableParsingElement(pelement) == nullptr) {
for (FormatElement *element : pelement)
genElementPrinter(element, body, op, shouldEmitSpace,
lastWasPunctuation);
@@ -2369,7 +2496,7 @@ void OperationFormat::genElementPrinter(FormatElement *element,
return;
}
- // Emit the attribute dictionary.
+ // Emit the property dictionary.
if (isa<PropDictDirective>(element)) {
genPropDictPrinter(*this, op, body);
lastWasPunctuation = false;
@@ -2408,6 +2535,13 @@ void OperationFormat::genElementPrinter(FormatElement *element,
else
body << "_odsPrinter.printStrippedAttrOrType("
<< op.getGetterName(var->name) << "Attr());\n";
+ } else if (auto *property = dyn_cast<PropertyVariable>(element)) {
+ const NamedProperty *var = property->getVar();
+ FmtContext fmtContext;
+ fmtContext.addSubst("_printer", "_odsPrinter");
+ fmtContext.addSubst("_ctxt", "getContext()");
+ fmtContext.addSubst("_storage", "getProperties()." + var->name);
+ body << tgfmt(var->prop.getPrinterCall(), &fmtContext) << ";\n";
} else if (auto *operand = dyn_cast<OperandVariable>(element)) {
if (operand->getVar()->isVariadicOfVariadic()) {
body << " ::llvm::interleaveComma("
@@ -2737,6 +2871,10 @@ static bool isOptionallyParsed(FormatElement *el) {
Attribute attr = attrVar->getVar()->attr;
return attr.isOptional() || attr.hasDefaultValue();
}
+ if (auto *propVar = dyn_cast<PropertyVariable>(el)) {
+ const Property &prop = propVar->getVar()->prop;
+ return prop.hasDefaultValue() && prop.hasOptionalParser();
+ }
if (auto *operandVar = dyn_cast<OperandVariable>(el)) {
const NamedTypeConstraint *operand = operandVar->getVar();
return operand->isOptional() || operand->isVariadic() ||
@@ -3141,10 +3279,9 @@ OpFormatParser::parseVariableImpl(SMLoc loc, StringRef name, Context ctx) {
}
if (const NamedProperty *property = findArg(op.getProperties(), name)) {
- if (ctx != CustomDirectiveContext && ctx != RefDirectiveContext)
+ if (ctx == TypeDirectiveContext)
return emitError(
- loc, "properties currently only supported in `custom` directive");
-
+ loc, "properties cannot be used as children to a `type` directive");
if (ctx == RefDirectiveContext) {
if (!seenProperties.count(property))
return emitError(loc, "property '" + name +
@@ -3428,6 +3565,15 @@ LogicalResult OpFormatParser::verifyOIListParsingElement(FormatElement *element,
"an oilist parsing group");
return success();
})
+ // Only default-valued (optionally parsed) properties can be within an
+ // oilist parsing group.
+ .Case([&](PropertyVariable *propEle) {
+ if (!propEle->getVar()->prop.hasDefaultValue())
+ return emitError(
+ loc,
+ "only default-valued or optional properties can be used in "
+ "an olist parsing group");
+ return success();
+ })
// Only optional-like(i.e. variadic) operands can be within an
// oilist parsing group.
.Case([&](OperandVariable *ele) {
@@ -3557,6 +3703,16 @@ LogicalResult OpFormatParser::verifyOptionalGroupElement(SMLoc loc,
"can be used to anchor an optional group");
return success();
})
+ // All properties can be within the optional group, but only optional
+ // properties can be the anchor.
+ .Case([&](PropertyVariable *propEle) {
+ Property prop = propEle->getVar()->prop;
+ if (isAnchor && !(prop.hasDefaultValue() && prop.hasOptionalParser()))
+ return emitError(loc, "only properties with default values "
+ "that can be optionally parsed "
+ "can be used to anchor an optional group");
+ return success();
+ })
// Only optional-like(i.e. variadic) operands can be within an optional
// group.
.Case([&](OperandVariable *ele) {
diff --git a/mlir/utils/spirv/gen_spirv_dialect.py b/mlir/utils/spirv/gen_spirv_dialect.py
index 426bfca..78c1022 100755
--- a/mlir/utils/spirv/gen_spirv_dialect.py
+++ b/mlir/utils/spirv/gen_spirv_dialect.py
@@ -536,7 +536,10 @@ def gen_instr_coverage_report(path, instructions):
content = content.split(AUTOGEN_OPCODE_SECTION_MARKER)
- existing_opcodes = [k[11:] for k in re.findall("def SPIRV_OC_\w+", content[1])]
+ prefix = "def SPIRV_OC_"
+ existing_opcodes = [
+ k[len(prefix) :] for k in re.findall(prefix + "\w+", content[1])
+ ]
existing_instructions = list(
filter(lambda inst: (inst["opname"] in existing_opcodes), instructions)
)
@@ -637,7 +640,12 @@ def update_td_enum_attrs(path, operand_kinds, filter_list):
assert len(content) == 3
# Extend filter list with existing enum definitions
- existing_kinds = [k[8:-4] for k in re.findall("def SPIRV_\w+Attr", content[1])]
+ prefix = "def SPIRV_"
+ suffix = "Attr"
+ existing_kinds = [
+ k[len(prefix) : -len(suffix)]
+ for k in re.findall(prefix + "\w+" + suffix, content[1])
+ ]
filter_list.extend(existing_kinds)
capability_mapping = get_capability_mapping(operand_kinds)
@@ -959,12 +967,20 @@ def extract_td_op_info(op_def):
- A dict containing potential manually specified sections
"""
# Get opname
- opname = [o[8:-2] for o in re.findall("def SPIRV_\w+Op", op_def)]
+ prefix = "def SPIRV_"
+ suffix = "Op"
+ opname = [
+ o[len(prefix) : -len(suffix)]
+ for o in re.findall(prefix + "\w+" + suffix, op_def)
+ ]
assert len(opname) == 1, "more than one ops in the same section!"
opname = opname[0]
# Get instruction category
- inst_category = [o[4:] for o in re.findall("SPIRV_\w+Op", op_def.split(":", 1)[1])]
+ prefix = "SPIRV_"
+ inst_category = [
+ o[len(prefix) :] for o in re.findall(prefix + "\w+Op", op_def.split(":", 1)[1])
+ ]
assert len(inst_category) <= 1, "more than one ops in the same section!"
inst_category = inst_category[0] if len(inst_category) == 1 else "Op"
diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h
index dde86af..59a795c 100644
--- a/offload/DeviceRTL/include/LibC.h
+++ b/offload/DeviceRTL/include/LibC.h
@@ -18,7 +18,6 @@ extern "C" {
int memcmp(const void *lhs, const void *rhs, size_t count);
void memset(void *dst, int C, size_t count);
-
int printf(const char *format, ...);
}
diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp
index 4e16591..5a2c84c 100644
--- a/offload/DeviceRTL/src/Debug.cpp
+++ b/offload/DeviceRTL/src/Debug.cpp
@@ -26,10 +26,13 @@ using namespace ompx;
extern "C" {
void __assert_assume(bool condition) { __builtin_assume(condition); }
+#ifndef OMPTARGET_HAS_LIBC
[[gnu::weak]] void __assert_fail(const char *expr, const char *file,
unsigned line, const char *function) {
__assert_fail_internal(expr, nullptr, file, line, function);
}
+#endif
+
void __assert_fail_internal(const char *expr, const char *msg, const char *file,
unsigned line, const char *function) {
if (msg) {
diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp
index 4bca5d2..291ceb0 100644
--- a/offload/DeviceRTL/src/LibC.cpp
+++ b/offload/DeviceRTL/src/LibC.cpp
@@ -11,44 +11,33 @@
#pragma omp begin declare target device_type(nohost)
namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t);
+int32_t omp_vprintf(const char *Format, __builtin_va_list vlist);
}
+#ifndef OMPTARGET_HAS_LIBC
+namespace impl {
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-extern "C" int32_t vprintf(const char *, void *);
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
- return vprintf(Format, Arguments);
+extern "C" int vprintf(const char *format, ...);
+int omp_vprintf(const char *Format, __builtin_va_list vlist) {
+ return vprintf(Format, vlist);
}
-} // namespace impl
#pragma omp end declare variant
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-
-#ifdef OMPTARGET_HAS_LIBC
-// TODO: Remove this handling once we have varargs support.
-extern "C" struct FILE *stdout;
-extern "C" int32_t rpc_fprintf(FILE *, const char *, void *, uint64_t);
-
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
- return rpc_fprintf(stdout, Format, Arguments, Size);
-}
+int omp_vprintf(const char *Format, __builtin_va_list) { return -1; }
+#pragma omp end declare variant
} // namespace impl
-#else
-// We do not have a vprintf implementation for AMD GPU so we use a stub.
-namespace impl {
-int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
- return -1;
+
+extern "C" int printf(const char *Format, ...) {
+ __builtin_va_list vlist;
+ __builtin_va_start(vlist, Format);
+ return impl::omp_vprintf(Format, vlist);
}
-} // namespace impl
-#endif
-#pragma omp end declare variant
+#endif // OMPTARGET_HAS_LIBC
extern "C" {
-
[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) {
auto *L = reinterpret_cast<const unsigned char *>(lhs);
auto *R = reinterpret_cast<const unsigned char *>(rhs);
@@ -65,11 +54,6 @@ extern "C" {
for (size_t I = 0; I < count; ++I)
dstc[I] = C;
}
-
-/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
-int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
- return impl::omp_vprintf(Format, Arguments, Size);
-}
}
#pragma omp end declare target
diff --git a/offload/include/Shared/EnvironmentVar.h b/offload/include/Shared/EnvironmentVar.h
index 4cbdad69..82f434e 100644
--- a/offload/include/Shared/EnvironmentVar.h
+++ b/offload/include/Shared/EnvironmentVar.h
@@ -28,6 +28,7 @@ struct StringParser {
/// Class for reading and checking environment variables. Currently working with
/// integer, floats, std::string and bool types.
template <typename Ty> class Envar {
+ llvm::StringRef Name;
Ty Data;
bool IsPresent;
bool Initialized;
@@ -53,7 +54,7 @@ public:
/// take the value read from the environment variable, or the default if it
/// was not set or not correct. This constructor is not fallible.
Envar(llvm::StringRef Name, Ty Default = Ty())
- : Data(Default), IsPresent(false), Initialized(true) {
+ : Name(Name), Data(Default), IsPresent(false), Initialized(true) {
if (const char *EnvStr = getenv(Name.data())) {
// Check whether the envar is defined and valid.
@@ -84,6 +85,9 @@ public:
/// Get the definitive value.
operator Ty() const { return get(); }
+ /// Return the environment variable name.
+ llvm::StringRef getName() const { return Name; }
+
/// Indicate whether the environment variable was defined and valid.
bool isPresent() const { return IsPresent; }
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index 64a1d33..5d9fb5d 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -31,6 +31,7 @@ typedef enum {
HSA_STATUS_ERROR = 0x1000,
HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
+ HSA_STATUS_ERROR_EXCEPTION = 0x1016,
} hsa_status_t;
hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e6643d3..6046833 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -13,13 +13,16 @@
#include <atomic>
#include <cassert>
#include <cstddef>
+#include <cstdint>
#include <deque>
+#include <functional>
#include <mutex>
#include <string>
#include <system_error>
#include <unistd.h>
#include <unordered_map>
+#include "ErrorReporting.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
@@ -43,6 +46,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
@@ -685,12 +689,12 @@ struct AMDGPUQueueTy {
AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
/// Lazily initialize a new queue belonging to a specific agent.
- Error init(hsa_agent_t Agent, int32_t QueueSize) {
+ Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
if (Queue)
return Plugin::success();
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
- nullptr, UINT32_MAX, UINT32_MAX, &Queue);
+ &Device, UINT32_MAX, UINT32_MAX, &Queue);
return Plugin::check(Status, "Error in hsa_queue_create: %s");
}
@@ -875,10 +879,8 @@ private:
}
/// Callack that will be called when an error is detected on the HSA queue.
- static void callbackError(hsa_status_t Status, hsa_queue_t *Source, void *) {
- auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
- FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
- }
+ static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
+ void *Data);
/// The HSA queue.
hsa_queue_t *Queue;
@@ -1484,6 +1486,8 @@ public:
return true;
}
+ const AMDGPUQueueTy *getQueue() const { return Queue; }
+
/// Record the state of the stream on an event.
Error recordEvent(AMDGPUEventTy &Event) const;
@@ -1594,7 +1598,7 @@ struct AMDGPUStreamManagerTy final
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
- : GenericDeviceResourceManagerTy(Device),
+ : GenericDeviceResourceManagerTy(Device), Device(Device),
OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
NextQueue(0), Agent(HSAAgent) {}
@@ -1603,7 +1607,7 @@ struct AMDGPUStreamManagerTy final
QueueSize = HSAQueueSize;
MaxNumQueues = NumHSAQueues;
// Initialize one queue eagerly
- if (auto Err = Queues.front().init(Agent, QueueSize))
+ if (auto Err = Queues.front().init(Device, Agent, QueueSize))
return Err;
return GenericDeviceResourceManagerTy::init(InitialSize);
@@ -1660,7 +1664,7 @@ private:
}
// Make sure the queue is initialized, then add user & assign.
- if (auto Err = Queues[Index].init(Agent, QueueSize))
+ if (auto Err = Queues[Index].init(Device, Agent, QueueSize))
return Err;
Queues[Index].addUser();
Stream->Queue = &Queues[Index];
@@ -1668,6 +1672,9 @@ private:
return Plugin::success();
}
+ /// The device associated with this stream.
+ GenericDeviceTy &Device;
+
/// Envar for controlling the tracking of busy HSA queues.
BoolEnvar OMPX_QueueTracking;
@@ -3074,7 +3081,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Initialized = true;
// Register event handler to detect memory errors on the devices.
- Status = hsa_amd_register_system_event_handler(eventHandler, nullptr);
+ Status = hsa_amd_register_system_event_handler(eventHandler, this);
if (auto Err = Plugin::check(
Status, "Error in hsa_amd_register_system_event_handler: %s"))
return std::move(Err);
@@ -3209,7 +3216,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
private:
/// Event handler that will be called by ROCr if an event is detected.
- static hsa_status_t eventHandler(const hsa_amd_event_t *Event, void *) {
+ static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
+ void *PluginPtr) {
if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT)
return HSA_STATUS_SUCCESS;
@@ -3240,6 +3248,26 @@ private:
uint32_t Node = -1;
hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node);
+ AMDGPUPluginTy &Plugin = *reinterpret_cast<AMDGPUPluginTy *>(PluginPtr);
+ for (uint32_t I = 0, E = Plugin.getNumDevices();
+ Node != uint32_t(-1) && I < E; ++I) {
+ AMDGPUDeviceTy &AMDGPUDevice =
+ reinterpret_cast<AMDGPUDeviceTy &>(Plugin.getDevice(I));
+ auto KernelTraceInfoRecord =
+ AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+
+ uint32_t DeviceNode = -1;
+ if (auto Err =
+ AMDGPUDevice.getDeviceAttr(HSA_AGENT_INFO_NODE, DeviceNode)) {
+ consumeError(std::move(Err));
+ continue;
+ }
+ if (DeviceNode != Node)
+ continue;
+
+ ErrorReporter::reportKernelTraces(AMDGPUDevice, *KernelTraceInfoRecord);
+ }
+
// Abort the execution since we do not recover from this error.
FATAL_MESSAGE(1,
"Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64
@@ -3480,6 +3508,28 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
return Alloc;
}
+void AMDGPUQueueTy::callbackError(hsa_status_t Status, hsa_queue_t *Source,
+ void *Data) {
+ auto &AMDGPUDevice = *reinterpret_cast<AMDGPUDeviceTy *>(Data);
+
+ if (Status == HSA_STATUS_ERROR_EXCEPTION) {
+ auto KernelTraceInfoRecord =
+ AMDGPUDevice.KernelLaunchTraces.getExclusiveAccessor();
+ std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher =
+ [=](__tgt_async_info &AsyncInfo) {
+ auto *Stream = reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
+ if (!Stream || !Stream->getQueue())
+ return false;
+ return Stream->getQueue()->Queue == Source;
+ };
+ ErrorReporter::reportTrapInKernel(AMDGPUDevice, *KernelTraceInfoRecord,
+ AsyncInfoWrapperMatcher);
+ }
+
+ auto Err = Plugin::check(Status, "Received error in queue %p: %s", Source);
+ FATAL_MESSAGE(1, "%s", toString(std::move(Err)).data());
+}
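The plumbing above relies on HSA forwarding the user-data pointer given at queue creation to the error callback; a minimal sketch of that pattern, independent of the plugin types (the DeviceState name is an assumption):

  struct DeviceState { /* whatever the error handler needs */ };

  static void onQueueError(hsa_status_t Status, hsa_queue_t *Source, void *Data) {
    auto &State = *reinterpret_cast<DeviceState *>(Data); // recover the context
    (void)State;
    // ... report the error against State, then abort ...
  }

  // At queue creation, &State is passed as the callback data argument:
  //   hsa_queue_create(Agent, Size, HSA_QUEUE_TYPE_MULTI, onQueueError,
  //                    &State, UINT32_MAX, UINT32_MAX, &Queue);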
+
} // namespace plugin
} // namespace target
} // namespace omp
diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
new file mode 100644
index 0000000..72cfb527
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -0,0 +1,311 @@
+//===- ErrorReporting.h - Helper to provide nice error messages ----- c++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
+#define OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
+
+#include "PluginInterface.h"
+#include "Shared/EnvironmentVar.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <optional>
+#include <string>
+#include <unistd.h>
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+class ErrorReporter {
+
+ enum ColorTy {
+ Yellow = int(HighlightColor::Address),
+ Green = int(HighlightColor::String),
+ DarkBlue = int(HighlightColor::Tag),
+ Cyan = int(HighlightColor::Attribute),
+ DarkPurple = int(HighlightColor::Enumerator),
+ DarkRed = int(HighlightColor::Macro),
+ BoldRed = int(HighlightColor::Error),
+ BoldLightPurple = int(HighlightColor::Warning),
+ BoldDarkGrey = int(HighlightColor::Note),
+ BoldLightBlue = int(HighlightColor::Remark),
+ };
+
+ /// The banner printed at the beginning of an error report.
+ static constexpr auto ErrorBanner = "OFFLOAD ERROR: ";
+
+ /// Return the device id as a string, or "n/a" if not available.
+ static std::string getDeviceIdStr(GenericDeviceTy *Device) {
+ return Device ? std::to_string(Device->getDeviceId()) : "n/a";
+ }
+
+ /// Return a nice name for a TargetAllocTy.
+ static StringRef getAllocTyName(TargetAllocTy Kind) {
+ switch (Kind) {
+ case TARGET_ALLOC_DEVICE_NON_BLOCKING:
+ case TARGET_ALLOC_DEFAULT:
+ case TARGET_ALLOC_DEVICE:
+ return "device memory";
+ case TARGET_ALLOC_HOST:
+ return "pinned host memory";
+ case TARGET_ALLOC_SHARED:
+ return "managed memory";
+ break;
+ }
+ llvm_unreachable("Unknown target alloc kind");
+ }
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wgcc-compat"
+#pragma clang diagnostic ignored "-Wformat-security"
+ /// Print \p Format, instantiated with \p Args to stderr.
+ /// TODO: Allow redirection into a file stream.
+ template <typename... ArgsTy>
+ [[gnu::format(__printf__, 1, 2)]] static void print(const char *Format,
+ ArgsTy &&...Args) {
+ raw_fd_ostream OS(STDERR_FILENO, false);
+ OS << llvm::format(Format, Args...);
+ }
+
+ /// Print \p Format, instantiated with \p Args to stderr, but colored.
+ /// TODO: Allow redirection into a file stream.
+ template <typename... ArgsTy>
+ [[gnu::format(__printf__, 2, 3)]] static void
+ print(ColorTy Color, const char *Format, ArgsTy &&...Args) {
+ raw_fd_ostream OS(STDERR_FILENO, false);
+ WithColor(OS, HighlightColor(Color)) << llvm::format(Format, Args...);
+ }
+
+ /// Print \p Format, instantiated with \p Args to stderr, but colored and with
+ /// a banner.
+ /// TODO: Allow redirection into a file stream.
+ template <typename... ArgsTy>
+ [[gnu::format(__printf__, 1, 2)]] static void reportError(const char *Format,
+ ArgsTy &&...Args) {
+ print(BoldRed, "%s", ErrorBanner);
+ print(BoldRed, Format, Args...);
+ print("\n");
+ }
+#pragma clang diagnostic pop
+
+ static void reportError(const char *Str) { reportError("%s", Str); }
+ static void print(const char *Str) { print("%s", Str); }
+ static void print(StringRef Str) { print("%s", Str.str().c_str()); }
+ static void print(ColorTy Color, const char *Str) { print(Color, "%s", Str); }
+ static void print(ColorTy Color, StringRef Str) {
+ print(Color, "%s", Str.str().c_str());
+ }
+
+ /// Pretty print a stack trace.
+ static void reportStackTrace(StringRef StackTrace) {
+ if (StackTrace.empty())
+ return;
+
+ SmallVector<StringRef> Lines, Parts;
+ StackTrace.split(Lines, "\n", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+ int Start = Lines.empty() || !Lines[0].contains("PrintStackTrace") ? 0 : 1;
+ unsigned NumDigits =
+ (int)(floor(log10(Lines.size() - Start - /*0*/ 1)) + 1);
+ for (int I = Start, E = Lines.size(); I < E; ++I) {
+ auto Line = Lines[I];
+ Parts.clear();
+ Line = Line.drop_while([](char C) { return std::isspace(C); });
+ Line.split(Parts, " ", /*MaxSplit=*/2);
+ if (Parts.size() != 3 || Parts[0].size() < 2 || Parts[0][0] != '#') {
+ print("%s\n", Line.str().c_str());
+ continue;
+ }
+ unsigned FrameIdx = std::stoi(Parts[0].drop_front(1).str());
+ if (Start)
+ FrameIdx -= 1;
+ print(DarkPurple, " %s", Parts[0].take_front().str().c_str());
+ print(Green, "%*u", NumDigits, FrameIdx);
+ print(BoldLightBlue, " %s", Parts[1].str().c_str());
+ print(" %s\n", Parts[2].str().c_str());
+ }
+ print("\n");
+ }
+
+ /// Report information about an allocation associated with \p ATI.
+ static void reportAllocationInfo(AllocationTraceInfoTy *ATI) {
+ if (!ATI)
+ return;
+
+ if (!ATI->DeallocationTrace.empty()) {
+ print(BoldLightPurple, "Last deallocation:\n");
+ reportStackTrace(ATI->DeallocationTrace);
+ }
+
+ if (ATI->HostPtr)
+ print(BoldLightPurple,
+ "Last allocation of size %lu for host pointer %p:\n", ATI->Size,
+ ATI->HostPtr);
+ else
+ print(BoldLightPurple, "Last allocation of size %lu:\n", ATI->Size);
+ reportStackTrace(ATI->AllocationTrace);
+ if (!ATI->LastAllocationInfo)
+ return;
+
+ unsigned I = 0;
+ print(BoldLightPurple, "Prior allocations with the same base pointer:");
+ while (ATI->LastAllocationInfo) {
+ print("\n");
+ ATI = ATI->LastAllocationInfo;
+ print(BoldLightPurple, " #%u Prior deallocation of size %lu:\n", I,
+ ATI->Size);
+ reportStackTrace(ATI->DeallocationTrace);
+ if (ATI->HostPtr)
+ print(BoldLightPurple, " #%u Prior allocation for host pointer %p:\n",
+ I, ATI->HostPtr);
+ else
+ print(BoldLightPurple, " #%u Prior allocation:\n", I);
+ reportStackTrace(ATI->AllocationTrace);
+ ++I;
+ }
+ }
+
+ /// End the execution of the program.
+ static void abortExecution() { abort(); }
+
+public:
+#define DEALLOCATION_ERROR(Format, ...) \
+ reportError(Format, __VA_ARGS__); \
+ reportStackTrace(StackTrace); \
+ reportAllocationInfo(ATI); \
+ abortExecution();
+
+ static void reportDeallocationOfNonAllocatedPtr(void *DevicePtr,
+ TargetAllocTy Kind,
+ AllocationTraceInfoTy *ATI,
+ std::string &StackTrace) {
+ DEALLOCATION_ERROR("deallocation of non-allocated %s: %p",
+ getAllocTyName(Kind).data(), DevicePtr);
+ }
+
+ static void reportDeallocationOfDeallocatedPtr(void *DevicePtr,
+ TargetAllocTy Kind,
+ AllocationTraceInfoTy *ATI,
+ std::string &StackTrace) {
+ DEALLOCATION_ERROR("double-free of %s: %p", getAllocTyName(Kind).data(),
+ DevicePtr);
+ }
+
+ static void reportDeallocationOfWrongPtrKind(void *DevicePtr,
+ TargetAllocTy Kind,
+ AllocationTraceInfoTy *ATI,
+ std::string &StackTrace) {
+ DEALLOCATION_ERROR("deallocation requires %s but allocation was %s: %p",
+ getAllocTyName(Kind).data(),
+ getAllocTyName(ATI->Kind).data(), DevicePtr);
+#undef DEALLOCATION_ERROR
+ }
+
+ /// Report that a kernel encountered a trap instruction.
+ static void reportTrapInKernel(
+ GenericDeviceTy &Device, KernelTraceInfoRecordTy &KTIR,
+ std::function<bool(__tgt_async_info &)> AsyncInfoWrapperMatcher) {
+ assert(AsyncInfoWrapperMatcher && "A matcher is required");
+
+ uint32_t Idx = 0;
+ for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+ auto KTI = KTIR.getKernelTraceInfo(I);
+ if (KTI.Kernel == nullptr)
+ break;
+ // Skip kernels issued in other queues.
+ if (KTI.AsyncInfo && !(AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+ continue;
+ Idx = I;
+ break;
+ }
+
+ auto KTI = KTIR.getKernelTraceInfo(Idx);
+ if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo)))
+ reportError("Kernel '%s'", KTI.Kernel->getName());
+ reportError("execution interrupted by hardware trap instruction");
+ if (KTI.AsyncInfo && (AsyncInfoWrapperMatcher(*KTI.AsyncInfo))) {
+ if (!KTI.LaunchTrace.empty())
+ reportStackTrace(KTI.LaunchTrace);
+ else
+ print(Yellow, "Use '%s=1' to show the stack trace of the kernel\n",
+ Device.OMPX_TrackNumKernelLaunches.getName().data());
+ }
+ abort();
+ }
+
+ /// Report the kernel traces taken from \p KTIR, up to
+ /// OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES many.
+ static void reportKernelTraces(GenericDeviceTy &Device,
+ KernelTraceInfoRecordTy &KTIR) {
+ uint32_t NumKTIs = 0;
+ for (uint32_t I = 0, E = KTIR.size(); I < E; ++I) {
+ auto KTI = KTIR.getKernelTraceInfo(I);
+ if (KTI.Kernel == nullptr)
+ break;
+ ++NumKTIs;
+ }
+ if (NumKTIs == 0) {
+ print(BoldRed, "No kernel launches known\n");
+ return;
+ }
+
+ uint32_t TracesToShow =
+ std::min(Device.OMPX_TrackNumKernelLaunches.get(), NumKTIs);
+ if (TracesToShow == 0) {
+ if (NumKTIs == 1)
+ print(BoldLightPurple, "Display only launched kernel:\n");
+ else
+ print(BoldLightPurple, "Display last %u kernels launched:\n", NumKTIs);
+ } else {
+ if (NumKTIs == 1)
+ print(BoldLightPurple, "Display kernel launch trace:\n");
+ else
+ print(BoldLightPurple,
+ "Display %u of the %u last kernel launch traces:\n", TracesToShow,
+ NumKTIs);
+ }
+
+ for (uint32_t Idx = 0, I = 0; I < NumKTIs; ++Idx) {
+ auto KTI = KTIR.getKernelTraceInfo(Idx);
+ if (NumKTIs == 1)
+ print(BoldLightPurple, "Kernel '%s'\n", KTI.Kernel->getName());
+ else
+ print(BoldLightPurple, "Kernel %d: '%s'\n", I, KTI.Kernel->getName());
+ reportStackTrace(KTI.LaunchTrace);
+ ++I;
+ }
+
+ if (NumKTIs != 1) {
+ print(Yellow,
+ "Use '%s=<num>' to adjust the number of shown stack traces (%u "
+ "now, up to %zu)\n",
+ Device.OMPX_TrackNumKernelLaunches.getName().data(),
+ Device.OMPX_TrackNumKernelLaunches.get(), KTIR.size());
+ }
+ // TODO: Let users know how to serialize kernels
+ }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OFFLOAD_PLUGINS_NEXTGEN_COMMON_ERROR_REPORTING_H
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 973add0..8182333 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -19,6 +19,7 @@
#include <shared_mutex>
#include <vector>
+#include "ExclusiveAccess.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"
@@ -382,6 +383,73 @@ protected:
bool IsBareKernel = false;
};
+/// Information about an allocation, when it has been allocated, and when/if it
+/// has been deallocated, for error reporting purposes.
+struct AllocationTraceInfoTy {
+
+ /// The stack trace of the allocation itself.
+ std::string AllocationTrace;
+
+ /// The stack trace of the deallocation, or empty.
+ std::string DeallocationTrace;
+
+ /// The allocated device pointer.
+ void *DevicePtr = nullptr;
+
+ /// The corresponding host pointer (can be null).
+ void *HostPtr = nullptr;
+
+ /// The size of the allocation.
+ uint64_t Size = 0;
+
+ /// The kind of the allocation.
+ TargetAllocTy Kind = TargetAllocTy::TARGET_ALLOC_DEFAULT;
+
+ /// Information about the last allocation at this address, if any.
+ AllocationTraceInfoTy *LastAllocationInfo = nullptr;
+
+ /// Lock to keep accesses race free.
+ std::mutex Lock;
+};
+
+/// Information about a kernel launch: the kernel, the stack trace of the
+/// launch, and the async info it was issued in, for error reporting purposes.
+struct KernelTraceInfoTy {
+
+ /// The launched kernel.
+ GenericKernelTy *Kernel;
+
+ /// The stack trace of the launch itself.
+ std::string LaunchTrace;
+
+ /// The async info the kernel was launched in.
+ __tgt_async_info *AsyncInfo;
+};
+
+struct KernelTraceInfoRecordTy {
+ KernelTraceInfoRecordTy() { KTIs.fill({}); }
+
+ /// Return the (maximal) record size.
+ auto size() const { return KTIs.size(); }
+
+ /// Create a new kernel trace info and add it into the record.
+ void emplace(GenericKernelTy *Kernel, const std::string &&StackTrace,
+ __tgt_async_info *AsyncInfo) {
+ KTIs[Idx] = {Kernel, std::move(StackTrace), AsyncInfo};
+ Idx = (Idx + 1) % size();
+ }
+
+ /// Return the \p I'th last kernel trace info.
+ auto getKernelTraceInfo(int32_t I) const {
+ // Note that kernel trace infos "grow forward", so lookup is backwards.
+ return KTIs[(Idx - I - 1 + size()) % size()];
+ }
+
+private:
+ std::array<KernelTraceInfoTy, 8> KTIs;
+ unsigned Idx = 0;
+};
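The record behaves as a fixed-size ring buffer over the last eight launches: emplace() overwrites the oldest slot, and getKernelTraceInfo(0) yields the most recent entry. A hedged usage sketch with hypothetical kernels K1 and K2:

  KernelTraceInfoRecordTy Record;
  Record.emplace(&K1, std::string("launch trace of K1"), /*AsyncInfo=*/nullptr);
  Record.emplace(&K2, std::string("launch trace of K2"), /*AsyncInfo=*/nullptr);
  auto Newest = Record.getKernelTraceInfo(0); // refers to K2
  auto Older = Record.getKernelTraceInfo(1);  // refers to K1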
+
/// Class representing a map of host pinned allocations. We track these pinned
/// allocations, so memory tranfers invloving these buffers can be optimized.
class PinnedAllocationMapTy {
@@ -866,6 +934,18 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Reference to the underlying plugin that created this device.
GenericPluginTy &Plugin;
+ /// Map to record when allocations have been performed, and when they have
+ /// been deallocated, both for error reporting purposes.
+ ProtectedObj<DenseMap<void *, AllocationTraceInfoTy *>> AllocationTraces;
+
+ /// Record of the kernels that have been launched, for error reporting
+ /// purposes.
+ ProtectedObj<KernelTraceInfoRecordTy> KernelLaunchTraces;
+
+ /// Environment variable to determine if stack traces for kernel launches are
+ /// tracked.
+ UInt32Envar OMPX_TrackNumKernelLaunches =
+ UInt32Envar("OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES", 0);
+
private:
/// Get and set the stack size and heap size for the device. If not used, the
/// plugin can implement the setters as no-op and setting the output
@@ -916,6 +996,11 @@ protected:
UInt32Envar OMPX_InitialNumStreams;
UInt32Envar OMPX_InitialNumEvents;
+ /// Environment variable to determine if stack traces for allocations and
+ /// deallocations are tracked.
+ BoolEnvar OMPX_TrackAllocationTraces =
+ BoolEnvar("OFFLOAD_TRACK_ALLOCATION_TRACES", false);
+
/// Array of images loaded into the device. Images are automatically
/// deallocated by the allocator.
llvm::SmallVector<DeviceImageTy *> LoadedImages;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 1182659..c3ecbcc 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -14,6 +14,7 @@
#include "Shared/Debug.h"
#include "Shared/Environment.h"
+#include "ErrorReporting.h"
#include "GlobalHandler.h"
#include "JIT.h"
#include "Utils/ELF.h"
@@ -30,6 +31,8 @@
#include "llvm/Support/JSON.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <limits>
@@ -1337,6 +1340,25 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
return std::move(Err);
+ // Keep track of the allocation stack if we track allocation traces.
+ if (OMPX_TrackAllocationTraces) {
+ std::string StackTrace;
+ llvm::raw_string_ostream OS(StackTrace);
+ llvm::sys::PrintStackTrace(OS);
+
+ AllocationTraceInfoTy *ATI = new AllocationTraceInfoTy();
+ ATI->AllocationTrace = std::move(StackTrace);
+ ATI->DevicePtr = Alloc;
+ ATI->HostPtr = HostPtr;
+ ATI->Size = Size;
+ ATI->Kind = Kind;
+
+ auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+ auto *&MapATI = (*AllocationTraceMap)[Alloc];
+ ATI->LastAllocationInfo = MapATI;
+ MapATI = ATI;
+ }
+
return Alloc;
}
@@ -1345,6 +1367,37 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
if (Plugin.getRecordReplay().isRecordingOrReplaying())
return Plugin::success();
+ // Keep track of the deallocation stack if we track allocation traces.
+ if (OMPX_TrackAllocationTraces) {
+ AllocationTraceInfoTy *ATI = nullptr;
+ {
+ auto AllocationTraceMap = AllocationTraces.getExclusiveAccessor();
+ ATI = (*AllocationTraceMap)[TgtPtr];
+ }
+
+ std::string StackTrace;
+ llvm::raw_string_ostream OS(StackTrace);
+ llvm::sys::PrintStackTrace(OS);
+
+ if (!ATI)
+ ErrorReporter::reportDeallocationOfNonAllocatedPtr(TgtPtr, Kind, ATI,
+ StackTrace);
+
+ // ATI is not null, thus we can lock it to inspect and modify it further.
+ std::lock_guard<std::mutex> LG(ATI->Lock);
+ if (!ATI->DeallocationTrace.empty())
+ ErrorReporter::reportDeallocationOfDeallocatedPtr(TgtPtr, Kind, ATI,
+ StackTrace);
+
+ if (ATI->Kind != Kind)
+ ErrorReporter::reportDeallocationOfWrongPtrKind(TgtPtr, Kind, ATI,
+ StackTrace);
+
+ ATI->DeallocationTrace = StackTrace;
+
+#undef DEALLOCATION_ERROR
+ }
+
int Res;
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
@@ -1415,6 +1468,18 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
GenericKernelTy &GenericKernel =
*reinterpret_cast<GenericKernelTy *>(EntryPtr);
+ {
+ std::string StackTrace;
+ if (OMPX_TrackNumKernelLaunches) {
+ llvm::raw_string_ostream OS(StackTrace);
+ llvm::sys::PrintStackTrace(OS);
+ }
+
+ auto KernelTraceInfoRecord = KernelLaunchTraces.getExclusiveAccessor();
+ (*KernelTraceInfoRecord)
+ .emplace(&GenericKernel, std::move(StackTrace), AsyncInfo);
+ }
+
auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
AsyncInfoWrapper);
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
index 9bca852..3b627d2 100644
--- a/offload/src/omptarget.cpp
+++ b/offload/src/omptarget.cpp
@@ -462,7 +462,9 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
if (DeviceOrErr->deleteData(DevicePtr, Kind) == OFFLOAD_FAIL)
- FATAL_MESSAGE(DeviceNum, "%s", "Failed to deallocate device ptr");
+ FATAL_MESSAGE(DeviceNum, "%s",
+ "Failed to deallocate device ptr. Set "
+ "OFFLOAD_TRACK_ALLOCATION_TRACES=1 to track allocations.");
DP("omp_target_free deallocated device ptr\n");
}
diff --git a/offload/test/libc/assert.c b/offload/test/libc/assert.c
index 0501e36..bf155b6 100644
--- a/offload/test/libc/assert.c
+++ b/offload/test/libc/assert.c
@@ -2,10 +2,6 @@
// RUN: %fcheck-generic --check-prefix=CHECK
// REQUIRES: libc
-
-// AMDGPU and NVPTX without LTO uses the implementation in OpenMP currently.
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: amdgcn-amd-amdhsa
// REQUIRES: gpu
#include <assert.h>
diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c
index 237e158..35ecf55 100644
--- a/offload/test/offloading/bug51781.c
+++ b/offload/test/offloading/bug51781.c
@@ -31,7 +31,6 @@
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
//
// CUSTOM: Rewriting generic-mode kernel with a customized state machine.
-// XFAIL: amdgcn-amd-amdhsa
#if ADD_REDUCTION
#define REDUCTION(...) reduction(__VA_ARGS__)
diff --git a/offload/test/sanitizer/double_free.c b/offload/test/sanitizer/double_free.c
new file mode 100644
index 0000000..ca7310e
--- /dev/null
+++ b/offload/test/sanitizer/double_free.c
@@ -0,0 +1,68 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+ void *Ptr1 = omp_target_alloc(8, 0);
+ omp_target_free(Ptr1, 0);
+ void *Ptr2 = omp_target_alloc(8, 0);
+ omp_target_free(Ptr2, 0);
+ void *Ptr3 = omp_target_alloc(8, 0);
+ omp_target_free(Ptr3, 0);
+ omp_target_free(Ptr2, 0);
+}
+
+// CHECK: OFFLOAD ERROR: double-free of device memory: 0x
+// CHECK: dataDelete
+// CHECK: omp_target_free
+// NDEBG: main
+// DEBUG: main {{.*}}double_free.c:25
+//
+// CHECK: Last deallocation:
+// CHECK: dataDelete
+// CHECK: omp_target_free
+// NDEBG: main
+// DEBUG: main {{.*}}double_free.c:24
+//
+// CHECK: Last allocation of size 8:
+// CHECK: dataAlloc
+// CHECK: omp_target_alloc
+// NDEBG: main
+// DEBUG: main {{.*}}double_free.c:23
+//
+// CHECK: Prior allocations with the same base pointer:
+// CHECK: #0 Prior deallocation of size 8:
+// CHECK: dataDelete
+// CHECK: omp_target_free
+// NDEBG: main
+// DEBUG: main {{.*}}double_free.c:22
+//
+// CHECK: #0 Prior allocation:
+// CHECK: dataAlloc
+// CHECK: omp_target_alloc
+// NDEBG: main
+// DEBUG: main {{.*}}double_free.c:20
+//
+// CHECK: #1 Prior deallocation of size 8:
+// CHECK: dataDelete
+// CHECK: omp_target_free
+// NDEBG: main
+// DEBUG: main {{.*}}double_free.c:20
+//
+// CHECK: #1 Prior allocation:
+// CHECK: dataAlloc
+// CHECK: omp_target_alloc
+// NDEBG: main
+// DEBUG: main {{.*}}double_free.c:19
diff --git a/offload/test/sanitizer/double_free_racy.c b/offload/test/sanitizer/double_free_racy.c
new file mode 100644
index 0000000..3b4f2d5
--- /dev/null
+++ b/offload/test/sanitizer/double_free_racy.c
@@ -0,0 +1,33 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+ void *Ptr1 = omp_target_alloc(8, 0);
+#pragma omp parallel num_threads(4)
+ omp_target_free(Ptr1, 0);
+}
+
+// CHECK: OFFLOAD ERROR: double-free of device memory: 0x
+// CHECK: dataDelete
+// CHECK: omp_target_free
+//
+// CHECK: Last deallocation:
+// CHECK: dataDelete
+// CHECK: omp_target_free
+
+// CHECK: Last allocation of size 8:
+// CHECK: dataAlloc
+// CHECK: omp_target_alloc
diff --git a/offload/test/sanitizer/free_host_ptr.c b/offload/test/sanitizer/free_host_ptr.c
new file mode 100644
index 0000000..1ec6863
--- /dev/null
+++ b/offload/test/sanitizer/free_host_ptr.c
@@ -0,0 +1,25 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+ int X;
+ omp_target_free(&X, 0);
+}
+
+// CHECK: OFFLOAD ERROR: deallocation of non-allocated device memory: 0x
+// CHECK: dataDelete
+// NDEBG: main
+// DEBUG: main {{.*}}free_host_ptr.c:20
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.c b/offload/test/sanitizer/free_wrong_ptr_kind.c
new file mode 100644
index 0000000..0c17854
--- /dev/null
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.c
@@ -0,0 +1,35 @@
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
+
+int main(void) {
+ void *P = llvm_omp_target_alloc_host(8, 0);
+ omp_target_free(P, 0);
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: deallocation requires device memory but allocation was pinned host memory: 0x
+// CHECK: dataDelete
+// CHECK: omp_target_free
+// NDEBG: main
+// DEBUG: main {{.*}}free_wrong_ptr_kind.c:22
+//
+// CHECK: Last allocation of size 8:
+// CHECK: dataAlloc
+// CHECK: llvm_omp_target_alloc_host
+// NDEBG: main
+// DEBUG: main {{.*}}free_wrong_ptr_kind.c:21
diff --git a/offload/test/sanitizer/free_wrong_ptr_kind.cpp b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
new file mode 100644
index 0000000..87a52c5
--- /dev/null
+++ b/offload/test/sanitizer/free_wrong_ptr_kind.cpp
@@ -0,0 +1,38 @@
+// clang-format off
+// RUN: %libomptarget-compileoptxx-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG
+// RUN: %libomptarget-compileoptxx-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_host(void *Ptr, int DeviceNum);
+}
+
+int main(void) {
+ void *P = llvm_omp_target_alloc_shared(8, 0);
+ llvm_omp_target_free_host(P, 0);
+}
+
+// clang-format off
+// CHECK: OFFLOAD ERROR: deallocation requires pinned host memory but allocation was managed memory: 0x
+// CHECK: dataDelete
+// CHECK: llvm_omp_target_free_host
+// NDEBG: main
+// DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:25
+//
+// CHECK: Last allocation of size 8:
+// CHECK: dataAlloc
+// CHECK: llvm_omp_target_alloc_shared
+// NDEBG: main
+// DEBUG: main {{.*}}free_wrong_ptr_kind.cpp:24
diff --git a/offload/test/sanitizer/kernel_crash.c b/offload/test/sanitizer/kernel_crash.c
new file mode 100644
index 0000000..457d953
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash.c
@@ -0,0 +1,47 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+ int *A = 0;
+#pragma omp target
+ {
+ }
+#pragma omp target
+ {
+ }
+#pragma omp target
+ {
+ *A = 42;
+ }
+#pragma omp target
+ {
+ }
+}
+// TRACE: Display 1 of the 3 last kernel launch traces
+// TRACE: Kernel 0: '__omp_offloading_{{.*}}_main_l30'
+// TRACE: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash.c:30
+//
+// CHECK: Display last 3 kernels launched:
+// CHECK: Kernel 0: '__omp_offloading_{{.*}}_main_l30'
+// CHECK: Kernel 1: '__omp_offloading_{{.*}}_main_l27'
+// CHECK: Kernel 2: '__omp_offloading_{{.*}}_main_l24'
diff --git a/offload/test/sanitizer/kernel_crash_async.c b/offload/test/sanitizer/kernel_crash_async.c
new file mode 100644
index 0000000..6aebf1b
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_async.c
@@ -0,0 +1,40 @@
+
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+ int *A = 0;
+#pragma omp target nowait
+ {
+ }
+#pragma omp target nowait
+ {
+ }
+#pragma omp target nowait
+ {
+ *A = 42;
+ }
+#pragma omp taskwait
+}
+
+// TRACE: Kernel {{.*}}'__omp_offloading_{{.*}}_main_
+// TRACE: launchKernel
+//
+// CHECK-DAG: Kernel {{[0-9]}}: '__omp_offloading_{{.*}}_main_l30'
diff --git a/offload/test/sanitizer/kernel_crash_many.c b/offload/test/sanitizer/kernel_crash_many.c
new file mode 100644
index 0000000..9e3f4f1
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_many.c
@@ -0,0 +1,73 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=24 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,NDEBG
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=16 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,DEBUG
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+ int *A = 0;
+ for (int i = 0; i < 10; ++i) {
+#pragma omp target
+ {
+ }
+ }
+#pragma omp target
+ {
+ *A = 42;
+ }
+}
+// CHECK: Display 8 of the 8 last kernel launch traces
+// CHECK: Kernel 0: '__omp_offloading_{{.*}}_main_l27'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:27
+//
+// CHECK: Kernel 1: '__omp_offloading_{{.*}}_main_l23'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 2: '__omp_offloading_{{.*}}_main_l23'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 3: '__omp_offloading_{{.*}}_main_l23'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 4: '__omp_offloading_{{.*}}_main_l23'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 5: '__omp_offloading_{{.*}}_main_l23'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 6: '__omp_offloading_{{.*}}_main_l23'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:
+//
+// CHECK: Kernel 7: '__omp_offloading_{{.*}}_main_l23'
+// CHECK: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_many.c:
+//
+// CHECK-NOT: Kernel {{[0-9]+}}:
diff --git a/offload/test/sanitizer/kernel_crash_single.c b/offload/test/sanitizer/kernel_crash_single.c
new file mode 100644
index 0000000..16a8159
--- /dev/null
+++ b/offload/test/sanitizer/kernel_crash_single.c
@@ -0,0 +1,36 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+ int *A = 0;
+#pragma omp target
+ {
+ *A = 42;
+ }
+}
+// TRACE: Display kernel launch trace
+// TRACE: Kernel '__omp_offloading_{{.*}}_main_l24'
+// TRACE: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_crash_single.c:24
+//
+// CHECK: Display only launched kernel:
+// CHECK: Kernel '__omp_offloading_{{.*}}_main_l24'
diff --git a/offload/test/sanitizer/kernel_trap.c b/offload/test/sanitizer/kernel_trap.c
new file mode 100644
index 0000000..13fe6f2
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap.c
@@ -0,0 +1,42 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,NDEBG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+#pragma omp target
+ {
+ }
+#pragma omp target
+ {
+ }
+#pragma omp target
+ {
+ __builtin_trap();
+ }
+#pragma omp target
+ {
+ }
+}
+// CHECK: OFFLOAD ERROR: Kernel '__omp_offloading_{{.*}}_main_l30'
+// CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_trap.c:
diff --git a/offload/test/sanitizer/kernel_trap_async.c b/offload/test/sanitizer/kernel_trap_async.c
new file mode 100644
index 0000000..65e8880
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap_async.c
@@ -0,0 +1,40 @@
+
+// clang-format off
+// RUN: %libomptarget-compileopt-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// RUN: %libomptarget-compileopt-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE,DEBUG
+// RUN: %not --crash %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+#pragma omp target nowait
+ {
+ }
+#pragma omp target nowait
+ {
+ }
+#pragma omp target nowait
+ {
+ __builtin_trap();
+ }
+#pragma omp taskwait
+}
+
+// CHECK: OFFLOAD ERROR: Kernel '__omp_offloading_{{.*}}_main_l30'
+// CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE: launchKernel
+// DEBUG: kernel_trap_async.c:
diff --git a/offload/test/sanitizer/kernel_trap_many.c b/offload/test/sanitizer/kernel_trap_many.c
new file mode 100644
index 0000000..3f1796e
--- /dev/null
+++ b/offload/test/sanitizer/kernel_trap_many.c
@@ -0,0 +1,36 @@
+
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=24 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,NDEBG
+// RUN: %libomptarget-compile-generic -g
+// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=16 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=TRACE,DEBUG
+// clang-format on
+
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <omp.h>
+
+int main(void) {
+
+ for (int i = 0; i < 10; ++i) {
+#pragma omp target
+ {
+ }
+ }
+#pragma omp target
+ {
+ __builtin_trap();
+ }
+}
+// TRACE: OFFLOAD ERROR: Kernel '__omp_offloading_{{.*}}_main_l27'
+// TRACE: OFFLOAD ERROR: execution interrupted by hardware trap instruction
+// TRACE: launchKernel
+// NDEBG: main
+// DEBUG: main {{.*}}kernel_trap_many.c:
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 98dd984..ed002c8 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -743,6 +743,8 @@ variables is defined below.
* ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
* ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
* ``LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT=[TRUE/FALSE] (default TRUE)``
+ * ``OFFLOAD_TRACK_ALLOCATION_TRACES=[TRUE/FALSE] (default FALSE)``
+ * ``OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES=<Num> (default 0)``
LIBOMPTARGET_DEBUG
""""""""""""""""""
@@ -1170,6 +1172,18 @@ This environment variable can be used to control how the OpenMP runtime assigns
blocks to loops with high trip counts. By default we reuse existing blocks
rather than spawning new blocks.
+OFFLOAD_TRACK_ALLOCATION_TRACES
+"""""""""""""""""""""""""""""""
+
+This environment variable determines if the stack traces of allocations and
+deallocations are tracked to aid in error reporting, e.g., in case of
+double-free.
+
+OFFLOAD_TRACK_NUM_KERNEL_LAUNCH_TRACES
+""""""""""""""""""""""""""""""""""""""
+
+This environment variable determines how many stack traces of kernel launches
+are tracked to aid in error reporting, e.g., to identify which asynchronous
+kernel failed.
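+
+For illustration, the following minimal sketch (modeled on the ``double_free.c``
+test added alongside this change, and not a normative example) shows a
+double-free of device memory that the tracking can diagnose:
+
+.. code-block:: c
+
+   #include <omp.h>
+
+   int main(void) {
+     // Allocate 8 bytes on device 0, free it, then erroneously free it again.
+     void *Ptr = omp_target_alloc(8, /*DeviceNum=*/0);
+     omp_target_free(Ptr, 0);
+     omp_target_free(Ptr, 0); // double-free
+   }
+
+Running such a program with ``OFFLOAD_TRACK_ALLOCATION_TRACES=1`` set in the
+environment causes the runtime to report the double-free along with the
+recorded allocation and deallocation stack traces.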
.. _libomptarget_plugin:
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index f34e555..ab13ac4 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -203,7 +203,26 @@ int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
int depth = __kmp_topology->get_depth();
for (int level = 0; level < depth; ++level) {
- if (ahwthread->ids[level] < bhwthread->ids[level])
+    // Reverse sort cores by core efficiency (higher efficiencies earlier in
+    // the list), if available.
+ if (__kmp_is_hybrid_cpu() &&
+ __kmp_topology->get_type(level) == KMP_HW_CORE &&
+ ahwthread->attrs.is_core_eff_valid() &&
+ bhwthread->attrs.is_core_eff_valid()) {
+ if (ahwthread->attrs.get_core_eff() < bhwthread->attrs.get_core_eff())
+ return 1;
+ if (ahwthread->attrs.get_core_eff() > bhwthread->attrs.get_core_eff())
+ return -1;
+ }
+ if (ahwthread->ids[level] == bhwthread->ids[level])
+ continue;
+ // If the hardware id is unknown for this level, then place hardware thread
+ // further down in the sorted list as it should take last priority
+ if (ahwthread->ids[level] == UNKNOWN_ID)
+ return 1;
+ else if (bhwthread->ids[level] == UNKNOWN_ID)
+ return -1;
+ else if (ahwthread->ids[level] < bhwthread->ids[level])
return -1;
else if (ahwthread->ids[level] > bhwthread->ids[level])
return 1;
@@ -246,7 +265,7 @@ void kmp_hw_thread_t::print() const {
int depth = __kmp_topology->get_depth();
printf("%4d ", os_id);
for (int i = 0; i < depth; ++i) {
- printf("%4d ", ids[i]);
+ printf("%4d (%d) ", ids[i], sub_ids[i]);
}
if (attrs) {
if (attrs.is_core_type_valid())
@@ -264,7 +283,7 @@ void kmp_hw_thread_t::print() const {
// Add a layer to the topology based on the ids. Assume the topology
// is perfectly nested (i.e., so no object has more than one parent)
-void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
+void kmp_topology_t::insert_layer(kmp_hw_t type, const int *ids) {
// Figure out where the layer should go by comparing the ids of the current
// layers with the new ids
int target_layer;
@@ -325,7 +344,7 @@ void kmp_topology_t::_insert_windows_proc_groups() {
ids[i] = __kmp_get_proc_group(mask);
}
KMP_CPU_FREE(mask);
- _insert_layer(KMP_HW_PROC_GROUP, ids);
+ insert_layer(KMP_HW_PROC_GROUP, ids);
__kmp_free(ids);
// sort topology after adding proc groups
@@ -465,10 +484,13 @@ void kmp_topology_t::_gather_enumeration_information() {
int id = hw_thread.ids[layer];
if (id != previous_id[layer]) {
// Add an additional increment to each count
- for (int l = layer; l < depth; ++l)
- count[l]++;
+ for (int l = layer; l < depth; ++l) {
+ if (hw_thread.ids[l] != kmp_hw_thread_t::UNKNOWN_ID)
+ count[l]++;
+ }
// Keep track of topology layer ratio statistics
- max[layer]++;
+ if (hw_thread.ids[layer] != kmp_hw_thread_t::UNKNOWN_ID)
+ max[layer]++;
for (int l = layer + 1; l < depth; ++l) {
if (max[l] > ratio[l])
ratio[l] = max[l];
@@ -833,6 +855,8 @@ void kmp_topology_t::print(const char *env_var) const {
for (int i = 0; i < num_hw_threads; i++) {
__kmp_str_buf_clear(&buf);
for (int level = 0; level < depth; ++level) {
+ if (hw_threads[i].ids[level] == kmp_hw_thread_t::UNKNOWN_ID)
+ continue;
kmp_hw_t type = types[level];
__kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
__kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
@@ -1354,7 +1378,8 @@ bool kmp_topology_t::filter_hw_subset() {
sub_id = abs_sub_ids[level];
else
sub_id = hw_thread.sub_ids[level];
- if (sub_id < offset ||
+ if (hw_thread.ids[level] == kmp_hw_thread_t::UNKNOWN_ID ||
+ sub_id < offset ||
(num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
should_be_filtered = true;
break;
@@ -1904,6 +1929,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hw_thread.clear();
hw_thread.ids[index] = pu->logical_index;
hw_thread.os_id = pu->os_index;
+ hw_thread.original_idx = hw_thread_index;
// If multiple core types, then set that attribute for the hardware thread
#if HWLOC_API_VERSION >= 0x00020400
if (cpukinds) {
@@ -2018,6 +2044,7 @@ static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
hw_thread.clear();
hw_thread.os_id = i;
+ hw_thread.original_idx = avail_ct;
hw_thread.ids[0] = i;
hw_thread.ids[1] = 0;
hw_thread.ids[2] = 0;
@@ -2063,11 +2090,13 @@ static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
continue;
}
- kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
hw_thread.clear();
hw_thread.os_id = i;
+ hw_thread.original_idx = avail_ct;
hw_thread.ids[0] = i / BITS_PER_GROUP;
hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
+ avail_ct++;
}
return true;
}
@@ -2123,15 +2152,43 @@ static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
return 0;
}
-class kmp_cache_info_t {
+class cpuid_cache_info_t {
public:
struct info_t {
- unsigned level, mask;
+ unsigned level = 0;
+ unsigned mask = 0;
+ bool operator==(const info_t &rhs) const {
+ return level == rhs.level && mask == rhs.mask;
+ }
+ bool operator!=(const info_t &rhs) const { return !operator==(rhs); }
};
- kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
+ cpuid_cache_info_t() : depth(0) {
+ table[MAX_CACHE_LEVEL].level = 0;
+ table[MAX_CACHE_LEVEL].mask = 0;
+ }
size_t get_depth() const { return depth; }
info_t &operator[](size_t index) { return table[index]; }
const info_t &operator[](size_t index) const { return table[index]; }
+ bool operator==(const cpuid_cache_info_t &rhs) const {
+ if (rhs.depth != depth)
+ return false;
+ for (size_t i = 0; i < depth; ++i)
+ if (table[i] != rhs.table[i])
+ return false;
+ return true;
+ }
+ bool operator!=(const cpuid_cache_info_t &rhs) const {
+ return !operator==(rhs);
+ }
+  // Get cache information associated with L1, L2, L3 cache, etc.
+ // If level does not exist, then return the "NULL" level (level 0)
+ const info_t &get_level(unsigned level) const {
+ for (size_t i = 0; i < depth; ++i) {
+ if (table[i].level == level)
+ return table[i];
+ }
+ return table[MAX_CACHE_LEVEL];
+ }
static kmp_hw_t get_topology_type(unsigned level) {
KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
@@ -2145,13 +2202,6 @@ public:
}
return KMP_HW_UNKNOWN;
}
-
-private:
- static const int MAX_CACHE_LEVEL = 3;
-
- size_t depth;
- info_t table[MAX_CACHE_LEVEL];
-
void get_leaf4_levels() {
unsigned level = 0;
while (depth < MAX_CACHE_LEVEL) {
@@ -2176,6 +2226,11 @@ private:
level++;
}
}
+ static const int MAX_CACHE_LEVEL = 3;
+
+private:
+ size_t depth;
+ info_t table[MAX_CACHE_LEVEL + 1];
};
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
@@ -2483,6 +2538,7 @@ static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
hw_thread.ids[idx++] = threadInfo[i].threadId;
}
hw_thread.os_id = os;
+ hw_thread.original_idx = i;
}
__kmp_free(threadInfo);
@@ -2543,10 +2599,8 @@ enum {
INTEL_LEVEL_TYPE_DIE = 5,
INTEL_LEVEL_TYPE_LAST = 6,
};
-
-struct cpuid_level_info_t {
- unsigned level_type, mask, mask_width, nitems, cache_mask;
-};
+KMP_BUILD_ASSERT(INTEL_LEVEL_TYPE_LAST < sizeof(unsigned) * CHAR_BIT);
+#define KMP_LEAF_1F_KNOWN_LEVELS ((1u << INTEL_LEVEL_TYPE_LAST) - 1u)
static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
switch (intel_type) {
@@ -2566,16 +2620,77 @@ static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
return KMP_HW_UNKNOWN;
}
-// This function takes the topology leaf, a levels array to store the levels
-// detected and a bitmap of the known levels.
-// Returns the number of levels in the topology
-static unsigned
-__kmp_x2apicid_get_levels(int leaf,
- cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
- kmp_uint64 known_levels) {
+static int __kmp_topology_type_2_intel_type(kmp_hw_t type) {
+ switch (type) {
+ case KMP_HW_SOCKET:
+ return INTEL_LEVEL_TYPE_INVALID;
+ case KMP_HW_THREAD:
+ return INTEL_LEVEL_TYPE_SMT;
+ case KMP_HW_CORE:
+ return INTEL_LEVEL_TYPE_CORE;
+ case KMP_HW_TILE:
+ return INTEL_LEVEL_TYPE_TILE;
+ case KMP_HW_MODULE:
+ return INTEL_LEVEL_TYPE_MODULE;
+ case KMP_HW_DIE:
+ return INTEL_LEVEL_TYPE_DIE;
+ }
+ return INTEL_LEVEL_TYPE_INVALID;
+}
+
+struct cpuid_level_info_t {
+ unsigned level_type, mask, mask_width, nitems, cache_mask;
+};
+
+class cpuid_topo_desc_t {
+ unsigned desc = 0;
+
+public:
+ void clear() { desc = 0; }
+ bool contains(int intel_type) const {
+ KMP_DEBUG_ASSERT(intel_type >= 0 && intel_type < INTEL_LEVEL_TYPE_LAST);
+ if ((1u << intel_type) & desc)
+ return true;
+ return false;
+ }
+ bool contains_topology_type(kmp_hw_t type) const {
+ KMP_DEBUG_ASSERT(type >= 0 && type < KMP_HW_LAST);
+ int intel_type = __kmp_topology_type_2_intel_type(type);
+ return contains(intel_type);
+ }
+ bool contains(cpuid_topo_desc_t rhs) const {
+ return ((desc | rhs.desc) == desc);
+ }
+ void add(int intel_type) { desc |= (1u << intel_type); }
+ void add(cpuid_topo_desc_t rhs) { desc |= rhs.desc; }
+};
+
+struct cpuid_proc_info_t {
+ // Topology info
+ int os_id;
+ unsigned apic_id;
+ unsigned depth;
+ // Hybrid info
+ unsigned native_model_id;
+ int efficiency;
+ kmp_hw_core_type_t type;
+ cpuid_topo_desc_t description;
+
+ cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
+};
+
+// This function takes the topology leaf, an info pointer to store the levels
+// detected, and writable descriptors for the total topology.
+// Returns whether total types, depth, or description were modified.
+static bool __kmp_x2apicid_get_levels(int leaf, cpuid_proc_info_t *info,
+ kmp_hw_t total_types[KMP_HW_LAST],
+ int *total_depth,
+ cpuid_topo_desc_t *total_description) {
unsigned level, levels_index;
unsigned level_type, mask_width, nitems;
kmp_cpuid buf;
+ cpuid_level_info_t(&levels)[INTEL_LEVEL_TYPE_LAST] = info->levels;
+ bool retval = false;
// New algorithm has known topology layers act as highest unknown topology
// layers when unknown topology layers exist.
@@ -2590,10 +2705,12 @@ __kmp_x2apicid_get_levels(int leaf,
level_type = __kmp_extract_bits<8, 15>(buf.ecx);
mask_width = __kmp_extract_bits<0, 4>(buf.eax);
nitems = __kmp_extract_bits<0, 15>(buf.ebx);
- if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
- return 0;
+ if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) {
+ info->depth = 0;
+ return retval;
+ }
- if (known_levels & (1ull << level_type)) {
+ if (KMP_LEAF_1F_KNOWN_LEVELS & (1u << level_type)) {
// Add a new level to the topology
KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
levels[levels_index].level_type = level_type;
@@ -2609,6 +2726,22 @@ __kmp_x2apicid_get_levels(int leaf,
}
level++;
} while (level_type != INTEL_LEVEL_TYPE_INVALID);
+ KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+ info->description.clear();
+ info->depth = levels_index;
+
+ // If types, depth, and total_description are uninitialized,
+ // then initialize them now
+ if (*total_depth == 0) {
+ *total_depth = info->depth;
+ total_description->clear();
+ for (int i = *total_depth - 1, j = 0; i >= 0; --i, ++j) {
+ total_types[j] =
+ __kmp_intel_type_2_topology_type(info->levels[i].level_type);
+ total_description->add(info->levels[i].level_type);
+ }
+ retval = true;
+ }
// Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
@@ -2626,38 +2759,61 @@ __kmp_x2apicid_get_levels(int leaf,
levels[i].mask = (-1) << levels[i - 1].mask_width;
levels[i].cache_mask = 0;
}
+ info->description.add(info->levels[i].level_type);
+ }
+
+  // If this processor has a level type not present on other processors, then
+  // make sure to include it in the total types, depth, and description.
+  // One assumption here is that the first type, i.e. socket, is known.
+  // Another assumption is that the types array is always large enough to fit
+  // any new layers since its length is KMP_HW_LAST.
+ if (!total_description->contains(info->description)) {
+ for (int i = info->depth - 1, j = 0; i >= 0; --i, ++j) {
+ // If this level is known already, then skip it.
+ if (total_description->contains(levels[i].level_type))
+ continue;
+ // Unknown level, insert before last known level
+ kmp_hw_t curr_type =
+ __kmp_intel_type_2_topology_type(levels[i].level_type);
+ KMP_ASSERT(j != 0 && "Bad APIC Id information");
+ // Move over all known levels to make room for new level
+ for (int k = info->depth - 1; k >= j; --k) {
+ KMP_DEBUG_ASSERT(k + 1 < KMP_HW_LAST);
+ total_types[k + 1] = total_types[k];
+ }
+ // Insert new level
+ total_types[j] = curr_type;
+ (*total_depth)++;
+ }
+ total_description->add(info->description);
+ retval = true;
}
- return levels_index;
+ return retval;
}
static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
- cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
- unsigned levels_index;
kmp_cpuid buf;
- kmp_uint64 known_levels;
- int topology_leaf, highest_leaf, apic_id;
+ int topology_leaf, highest_leaf;
int num_leaves;
+ int depth = 0;
+ cpuid_topo_desc_t total_description;
static int leaves[] = {0, 0};
- kmp_i18n_id_t leaf_message_id;
+ // If affinity is disabled, __kmp_avail_proc may be zero
+ int ninfos = (__kmp_avail_proc > 0 ? __kmp_avail_proc : 1);
+ cpuid_proc_info_t *proc_info = (cpuid_proc_info_t *)__kmp_allocate(
+ (sizeof(cpuid_proc_info_t) + sizeof(cpuid_cache_info_t)) * ninfos);
+ cpuid_cache_info_t *cache_info = (cpuid_cache_info_t *)(proc_info + ninfos);
- KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
+ kmp_i18n_id_t leaf_message_id;
*msg_id = kmp_i18n_null;
if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
- // Figure out the known topology levels
- known_levels = 0ull;
- for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
- if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
- known_levels |= (1ull << i);
- }
- }
-
// Get the highest cpuid leaf supported
__kmp_x86_cpuid(0, 0, &buf);
highest_leaf = buf.eax;
@@ -2691,16 +2847,18 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
if (buf.ebx == 0)
continue;
topology_leaf = leaf;
- levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
- if (levels_index == 0)
+ __kmp_x2apicid_get_levels(leaf, &proc_info[0], types, &depth,
+ &total_description);
+ if (depth == 0)
continue;
break;
}
- if (topology_leaf == -1 || levels_index == 0) {
+ if (topology_leaf == -1 || depth == 0) {
*msg_id = leaf_message_id;
+ __kmp_free(proc_info);
return false;
}
- KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
+ KMP_ASSERT(depth <= INTEL_LEVEL_TYPE_LAST);
// The algorithm used starts by setting the affinity to each available thread
// and retrieving info from the cpuid instruction, so if we are not capable of
@@ -2711,42 +2869,19 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
KMP_ASSERT(__kmp_affinity.type == affinity_none);
- for (unsigned i = 0; i < levels_index; ++i) {
- if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
- __kmp_nThreadsPerCore = levels[i].nitems;
- } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
- nCoresPerPkg = levels[i].nitems;
+ for (int i = 0; i < depth; ++i) {
+ if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
+ __kmp_nThreadsPerCore = proc_info[0].levels[i].nitems;
+ } else if (proc_info[0].levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
+ nCoresPerPkg = proc_info[0].levels[i].nitems;
}
}
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+ __kmp_free(proc_info);
return true;
}
- // Allocate the data structure to be returned.
- int depth = levels_index;
- for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
- types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
- __kmp_topology =
- kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
-
- // Insert equivalent cache types if they exist
- kmp_cache_info_t cache_info;
- for (size_t i = 0; i < cache_info.get_depth(); ++i) {
- const kmp_cache_info_t::info_t &info = cache_info[i];
- unsigned cache_mask = info.mask;
- unsigned cache_level = info.level;
- for (unsigned j = 0; j < levels_index; ++j) {
- unsigned hw_cache_mask = levels[j].cache_mask;
- kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
- if (hw_cache_mask == cache_mask && j < levels_index - 1) {
- kmp_hw_t type =
- __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
- __kmp_topology->set_equivalent_type(cache_type, type);
- }
- }
- }
-
// From here on, we can assume that it is safe to call
// __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
// __kmp_affinity.type = affinity_none.
@@ -2758,56 +2893,167 @@ static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
// to it, and obtaining the pertinent information using the cpuid instr.
unsigned int proc;
int hw_thread_index = 0;
- KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
- cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
- unsigned my_levels_index;
+ bool uniform_caches = true;
+ KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
// Skip this proc if it is not included in the machine model.
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
continue;
}
KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
+ // Gather topology information
__kmp_affinity_dispatch->bind_thread(proc);
-
- // New algorithm
__kmp_x86_cpuid(topology_leaf, 0, &buf);
- apic_id = buf.edx;
- kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
- my_levels_index =
- __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
- if (my_levels_index == 0 || my_levels_index != levels_index) {
+ proc_info[hw_thread_index].os_id = proc;
+ proc_info[hw_thread_index].apic_id = buf.edx;
+ __kmp_x2apicid_get_levels(topology_leaf, &proc_info[hw_thread_index], types,
+ &depth, &total_description);
+ if (proc_info[hw_thread_index].depth == 0) {
*msg_id = kmp_i18n_str_InvalidCpuidInfo;
+ __kmp_free(proc_info);
return false;
}
- hw_thread.clear();
- hw_thread.os_id = proc;
- // Put in topology information
- for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
- hw_thread.ids[idx] = apic_id & my_levels[j].mask;
- if (j > 0) {
- hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
- }
- }
+ // Gather cache information and insert afterwards
+ cache_info[hw_thread_index].get_leaf4_levels();
+ if (uniform_caches && hw_thread_index > 0)
+ if (cache_info[0] != cache_info[hw_thread_index])
+ uniform_caches = false;
// Hybrid information
if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
- kmp_hw_core_type_t type;
- unsigned native_model_id;
- int efficiency;
- __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
- hw_thread.attrs.set_core_type(type);
- hw_thread.attrs.set_core_eff(efficiency);
+ __kmp_get_hybrid_info(&proc_info[hw_thread_index].type,
+ &proc_info[hw_thread_index].efficiency,
+ &proc_info[hw_thread_index].native_model_id);
}
hw_thread_index++;
}
KMP_ASSERT(hw_thread_index > 0);
+ previous_affinity.restore();
+
+ // Allocate the data structure to be returned.
+ __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
+
+ // Create topology Ids and hybrid types in __kmp_topology
+ for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ hw_thread.clear();
+ hw_thread.os_id = proc_info[i].os_id;
+ hw_thread.original_idx = i;
+ unsigned apic_id = proc_info[i].apic_id;
+ // Put in topology information
+ for (int j = 0, idx = depth - 1; j < depth; ++j, --idx) {
+ if (!(proc_info[i].description.contains_topology_type(
+ __kmp_topology->get_type(j)))) {
+ hw_thread.ids[idx] = kmp_hw_thread_t::UNKNOWN_ID;
+ } else {
+ hw_thread.ids[idx] = apic_id & proc_info[i].levels[j].mask;
+ if (j > 0) {
+ hw_thread.ids[idx] >>= proc_info[i].levels[j - 1].mask_width;
+ }
+ }
+ }
+ hw_thread.attrs.set_core_type(proc_info[i].type);
+ hw_thread.attrs.set_core_eff(proc_info[i].efficiency);
+ }
+
__kmp_topology->sort_ids();
+
+ // Change Ids to logical Ids
+ for (int j = 0; j < depth - 1; ++j) {
+ int new_id = 0;
+ int prev_id = __kmp_topology->at(0).ids[j];
+ int curr_id = __kmp_topology->at(0).ids[j + 1];
+ __kmp_topology->at(0).ids[j + 1] = new_id;
+ for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ if (hw_thread.ids[j] == prev_id && hw_thread.ids[j + 1] == curr_id) {
+ hw_thread.ids[j + 1] = new_id;
+ } else if (hw_thread.ids[j] == prev_id &&
+ hw_thread.ids[j + 1] != curr_id) {
+ curr_id = hw_thread.ids[j + 1];
+ hw_thread.ids[j + 1] = ++new_id;
+ } else {
+ prev_id = hw_thread.ids[j];
+ curr_id = hw_thread.ids[j + 1];
+ hw_thread.ids[j + 1] = ++new_id;
+ }
+ }
+ }
+
+ // First check for easy cache placement. This occurs when caches are
+ // equivalent to a layer in the CPUID leaf 0xb or 0x1f topology.
+ if (uniform_caches) {
+ for (size_t i = 0; i < cache_info[0].get_depth(); ++i) {
+ unsigned cache_mask = cache_info[0][i].mask;
+ unsigned cache_level = cache_info[0][i].level;
+ KMP_ASSERT(cache_level <= cpuid_cache_info_t::MAX_CACHE_LEVEL);
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(cache_level);
+ __kmp_topology->set_equivalent_type(cache_type, cache_type);
+ for (int j = 0; j < depth; ++j) {
+ unsigned hw_cache_mask = proc_info[0].levels[j].cache_mask;
+ if (hw_cache_mask == cache_mask && j < depth - 1) {
+ kmp_hw_t type = __kmp_intel_type_2_topology_type(
+ proc_info[0].levels[j + 1].level_type);
+ __kmp_topology->set_equivalent_type(cache_type, type);
+ }
+ }
+ }
+ } else {
+ // If caches are non-uniform, then record which caches exist.
+ for (int i = 0; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ for (size_t j = 0; j < cache_info[i].get_depth(); ++j) {
+ unsigned cache_level = cache_info[i][j].level;
+ kmp_hw_t cache_type =
+ cpuid_cache_info_t::get_topology_type(cache_level);
+ if (__kmp_topology->get_equivalent_type(cache_type) == KMP_HW_UNKNOWN)
+ __kmp_topology->set_equivalent_type(cache_type, cache_type);
+ }
+ }
+ }
+
+ // See if any cache level needs to be added manually through cache Ids
+ bool unresolved_cache_levels = false;
+ for (unsigned level = 1; level <= cpuid_cache_info_t::MAX_CACHE_LEVEL;
+ ++level) {
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(level);
+ // This also filters out caches which may not be in the topology
+ // since the equivalent type might be KMP_HW_UNKNOWN.
+ if (__kmp_topology->get_equivalent_type(cache_type) == cache_type) {
+ unresolved_cache_levels = true;
+ break;
+ }
+ }
+
+ // Insert unresolved cache layers into machine topology using cache Ids
+ if (unresolved_cache_levels) {
+ int num_hw_threads = __kmp_topology->get_num_hw_threads();
+ int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
+ for (unsigned l = 1; l <= cpuid_cache_info_t::MAX_CACHE_LEVEL; ++l) {
+ kmp_hw_t cache_type = cpuid_cache_info_t::get_topology_type(l);
+ if (__kmp_topology->get_equivalent_type(cache_type) != cache_type)
+ continue;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int original_idx = __kmp_topology->at(i).original_idx;
+ ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ const cpuid_cache_info_t::info_t &info =
+ cache_info[original_idx].get_level(l);
+        // If the cache level is not in the topology for this processor, then skip it.
+ if (info.level == 0)
+ continue;
+ ids[i] = info.mask & proc_info[original_idx].apic_id;
+ }
+ __kmp_topology->insert_layer(cache_type, ids);
+ }
+ }
+
if (!__kmp_topology->check_ids()) {
kmp_topology_t::deallocate(__kmp_topology);
__kmp_topology = nullptr;
*msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+ __kmp_free(proc_info);
return false;
}
+ __kmp_free(proc_info);
return true;
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
@@ -3571,6 +3817,7 @@ restart_radix_check:
kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
hw_thread.clear();
hw_thread.os_id = os;
+ hw_thread.original_idx = i;
idx = 0;
for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
@@ -3594,6 +3841,32 @@ restart_radix_check:
__kmp_free(counts);
CLEANUP_THREAD_INFO;
__kmp_topology->sort_ids();
+
+ int tlevel = __kmp_topology->get_level(KMP_HW_THREAD);
+ if (tlevel > 0) {
+ // If the thread level does not have ids, then put them in.
+ if (__kmp_topology->at(0).ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID) {
+ __kmp_topology->at(0).ids[tlevel] = 0;
+ }
+ for (int i = 1; i < __kmp_topology->get_num_hw_threads(); ++i) {
+ kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
+ if (hw_thread.ids[tlevel] != kmp_hw_thread_t::UNKNOWN_ID)
+ continue;
+ kmp_hw_thread_t &prev_hw_thread = __kmp_topology->at(i - 1);
+ // Check if socket, core, anything above thread level changed.
+ // If the ids did change, then restart thread id at 0
+ // Otherwise, set thread id to prev thread's id + 1
+ for (int j = 0; j < tlevel; ++j) {
+ if (hw_thread.ids[j] != prev_hw_thread.ids[j]) {
+ hw_thread.ids[tlevel] = 0;
+ break;
+ }
+ }
+ if (hw_thread.ids[tlevel] == kmp_hw_thread_t::UNKNOWN_ID)
+ hw_thread.ids[tlevel] = prev_hw_thread.ids[tlevel] + 1;
+ }
+ }
+
if (!__kmp_topology->check_ids()) {
kmp_topology_t::deallocate(__kmp_topology);
__kmp_topology = nullptr;
@@ -3620,8 +3893,8 @@ static void __kmp_create_os_id_masks(unsigned *numUnique,
KMP_ASSERT(depth);
i = find_next(-1);
- // If could not find HW thread location with attributes, then return and
- // fallback to increment find_next and disregard core attributes.
+  // If we could not find a HW thread location that satisfies the find_next
+  // conditions, then return and fall back to the incremental find_next.
if (i >= numAddrs)
return;
@@ -4739,16 +5012,33 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
}
}
// If core attributes did not work, or none were specified,
- // then make OS Id mask table using typical incremental way.
+  // then make the OS Id mask table using the typical incremental way, while
+  // checking the validity of each id at the specified granularity level.
+ if (!affinity.os_id_masks) {
+ int gran = affinity.gran_levels;
+ int gran_level = depth - 1 - affinity.gran_levels;
+ if (gran >= 0 && gran_level >= 0 && gran_level < depth) {
+ __kmp_create_os_id_masks(
+ &numUnique, affinity, [depth, numAddrs, &affinity](int idx) {
+ KMP_ASSERT(idx >= -1);
+ int gran = affinity.gran_levels;
+ int gran_level = depth - 1 - affinity.gran_levels;
+ for (int i = idx + 1; i < numAddrs; ++i)
+ if ((gran >= depth) ||
+ (gran < depth && __kmp_topology->at(i).ids[gran_level] !=
+ kmp_hw_thread_t::UNKNOWN_ID))
+ return i;
+ return numAddrs;
+ });
+ }
+ }
+ // Final attempt to make OS Id mask table using typical incremental way.
if (!affinity.os_id_masks) {
__kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
KMP_ASSERT(idx >= -1);
return idx + 1;
});
}
- if (affinity.gran_levels == 0) {
- KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
- }
switch (affinity.type) {
@@ -4894,6 +5184,8 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
int osId = __kmp_topology->at(i).os_id;
kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
+ if (KMP_CPU_ISEMPTY(src))
+ continue;
kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
KMP_ASSERT(KMP_CPU_ISSET(osId, src));
KMP_CPU_COPY(dest, src);
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 3dc2c84..ed24b6f 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -846,6 +846,7 @@ public:
int sub_ids[KMP_HW_LAST];
bool leader;
int os_id;
+ int original_idx;
kmp_hw_attr_t attrs;
void print() const;
@@ -905,9 +906,6 @@ class kmp_topology_t {
// Compact value used during sort_compact()
int compact;
- // Insert a new topology layer after allocation
- void _insert_layer(kmp_hw_t type, const int *ids);
-
#if KMP_GROUP_AFFINITY
// Insert topology information about Windows Processor groups
void _insert_windows_proc_groups();
@@ -967,6 +965,10 @@ public:
qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
kmp_hw_thread_t::compare_ids);
}
+
+ // Insert a new topology layer after allocation
+ void insert_layer(kmp_hw_t type, const int *ids);
+
// Check if the hardware ids are unique, if they are
// return true, otherwise return false
bool check_ids() const;
diff --git a/polly/include/polly/ScopBuilder.h b/polly/include/polly/ScopBuilder.h
index 635c23c..e589a7f 100644
--- a/polly/include/polly/ScopBuilder.h
+++ b/polly/include/polly/ScopBuilder.h
@@ -663,19 +663,6 @@ class ScopBuilder final {
/// nullptr if it cannot be hoisted at all.
isl::set getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes);
- /// Collect loads which might form a reduction chain with @p StoreMA.
- ///
- /// Check if the stored value for @p StoreMA is a binary operator with one or
- /// two loads as operands. If the binary operand is commutative & associative,
- /// used only once (by @p StoreMA) and its load operands are also used only
- /// once, we have found a possible reduction chain. It starts at an operand
- /// load and includes the binary operator and @p StoreMA.
- ///
- /// Note: We allow only one use to ensure the load and binary operator cannot
- /// escape this block or into any other store except @p StoreMA.
- void collectCandidateReductionLoads(MemoryAccess *StoreMA,
- SmallVectorImpl<MemoryAccess *> &Loads);
-
/// Build the access relation of all memory accesses of @p Stmt.
void buildAccessRelations(ScopStmt &Stmt);
diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h
index 1e0692f..974de81 100644
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@@ -470,6 +470,8 @@ public:
RT_BOR, ///< Bitwise Or
RT_BXOR, ///< Bitwise XOr
RT_BAND, ///< Bitwise And
+
+ RT_BOTTOM, ///< Pseudo type for the data flow analysis
};
using SubscriptsTy = SmallVector<const SCEV *, 4>;
@@ -1139,6 +1141,7 @@ class ScopStmt final {
friend class ScopBuilder;
public:
+ using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
/// Create the ScopStmt from a BasicBlock.
ScopStmt(Scop &parent, BasicBlock &bb, StringRef Name, Loop *SurroundingLoop,
std::vector<Instruction *> Instructions);
@@ -1206,7 +1209,6 @@ private:
/// The memory accesses of this statement.
///
/// The only side effects of a statement are its memory accesses.
- using MemoryAccessVec = llvm::SmallVector<MemoryAccess *, 8>;
MemoryAccessVec MemAccs;
/// Mapping from instructions to (scalar) memory accesses.
diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp
index d594823..c05fc1a 100644
--- a/polly/lib/Analysis/ScopBuilder.cpp
+++ b/polly/lib/Analysis/ScopBuilder.cpp
@@ -2481,8 +2481,8 @@ void ScopBuilder::collectSurroundingLoops(ScopStmt &Stmt) {
}
/// Return the reduction type for a given binary operator.
-static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
- const Instruction *Load) {
+static MemoryAccess::ReductionType
+getReductionType(const BinaryOperator *BinOp) {
if (!BinOp)
return MemoryAccess::RT_NONE;
switch (BinOp->getOpcode()) {
@@ -2511,6 +2511,17 @@ static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp,
}
}
+/// @brief Combine two reduction types
+static MemoryAccess::ReductionType
+combineReductionType(MemoryAccess::ReductionType RT0,
+ MemoryAccess::ReductionType RT1) {
+ if (RT0 == MemoryAccess::RT_BOTTOM)
+ return RT1;
+ if (RT0 == RT1)
+ return RT1;
+ return MemoryAccess::RT_NONE;
+}
+
/// True if @p AllAccs intersects with @p MemAccs except @p LoadMA and @p
/// StoreMA
bool hasIntersectingAccesses(isl::set AllAccs, MemoryAccess *LoadMA,
@@ -2571,47 +2582,205 @@ bool checkCandidatePairAccesses(MemoryAccess *LoadMA, MemoryAccess *StoreMA,
AllAccsRel = AllAccsRel.intersect_domain(Domain);
isl::set AllAccs = AllAccsRel.range();
Valid = !hasIntersectingAccesses(AllAccs, LoadMA, StoreMA, Domain, MemAccs);
-
POLLY_DEBUG(dbgs() << " == The accessed memory is " << (Valid ? "not " : "")
<< "accessed by other instructions!\n");
}
+
return Valid;
}
void ScopBuilder::checkForReductions(ScopStmt &Stmt) {
- SmallVector<MemoryAccess *, 2> Loads;
- SmallVector<std::pair<MemoryAccess *, MemoryAccess *>, 4> Candidates;
+ // Perform a data flow analysis on the current scop statement to propagate the
+ // uses of loaded values. Then check and mark the memory accesses which are
+  // part of reduction-like chains.
+ // During the data flow analysis we use the State variable to keep track of
+ // the used "load-instructions" for each instruction in the scop statement.
+ // This includes the LLVM-IR of the load and the "number of uses" (or the
+ // number of paths in the operand tree which end in this load).
+ using StatePairTy = std::pair<unsigned, MemoryAccess::ReductionType>;
+ using FlowInSetTy = MapVector<const LoadInst *, StatePairTy>;
+ using StateTy = MapVector<const Instruction *, FlowInSetTy>;
+ StateTy State;
+
+ // Invalid loads are loads which have uses we can't track properly in the
+ // state map. This includes loads which:
+ // o do not form a reduction when they flow into a memory location:
+ // (e.g., A[i] = B[i] * 3 and A[i] = A[i] * A[i] + A[i])
+  //   o are used by a non-binary operator or one which is not commutative
+ // and associative (e.g., A[i] = A[i] % 3)
+ // o might change the control flow (e.g., if (A[i]))
+ // o are used in indirect memory accesses (e.g., A[B[i]])
+ // o are used outside the current scop statement
+ SmallPtrSet<const Instruction *, 8> InvalidLoads;
+ SmallVector<BasicBlock *, 8> ScopBlocks;
+ BasicBlock *BB = Stmt.getBasicBlock();
+ if (BB)
+ ScopBlocks.push_back(BB);
+ else
+ for (BasicBlock *Block : Stmt.getRegion()->blocks())
+ ScopBlocks.push_back(Block);
+ // Run the data flow analysis for all values in the scop statement
+ for (BasicBlock *Block : ScopBlocks) {
+ for (Instruction &Inst : *Block) {
+ if ((Stmt.getParent())->getStmtFor(&Inst) != &Stmt)
+ continue;
+ bool UsedOutsideStmt = any_of(Inst.users(), [&Stmt](User *U) {
+ return (Stmt.getParent())->getStmtFor(cast<Instruction>(U)) != &Stmt;
+ });
+      // Treat loads and stores specially.
+ if (auto *Load = dyn_cast<LoadInst>(&Inst)) {
+        // Invalidate all loads which feed into the address of this load.
+ if (auto *Ptr = dyn_cast<Instruction>(Load->getPointerOperand())) {
+ const auto &It = State.find(Ptr);
+ if (It != State.end())
+ for (const auto &FlowInSetElem : It->second)
+ InvalidLoads.insert(FlowInSetElem.first);
+ }
- // First collect candidate load-store reduction chains by iterating over all
- // stores and collecting possible reduction loads.
- for (MemoryAccess *StoreMA : Stmt) {
- if (StoreMA->isRead())
- continue;
+ // If this load is used outside this stmt, invalidate it.
+ if (UsedOutsideStmt)
+ InvalidLoads.insert(Load);
+
+        // Record that this load uses itself once, but without specifying any
+        // reduction operator.
+ State[Load].insert(
+ std::make_pair(Load, std::make_pair(1, MemoryAccess::RT_BOTTOM)));
+ continue;
+ }
+
+ if (auto *Store = dyn_cast<StoreInst>(&Inst)) {
+ // Invalidate all loads which feed into the address of this store.
+ if (const Instruction *Ptr =
+ dyn_cast<Instruction>(Store->getPointerOperand())) {
+ const auto &It = State.find(Ptr);
+ if (It != State.end())
+ for (const auto &FlowInSetElem : It->second)
+ InvalidLoads.insert(FlowInSetElem.first);
+ }
+
+ // Propagate the uses of the value operand to the store
+ if (auto *ValueInst = dyn_cast<Instruction>(Store->getValueOperand()))
+ State.insert(std::make_pair(Store, State[ValueInst]));
+ continue;
+ }
+
+      // Instructions that are neither loads nor stores are either binary
+      // operators or will invalidate all used loads.
+ auto *BinOp = dyn_cast<BinaryOperator>(&Inst);
+ MemoryAccess::ReductionType CurRedType = getReductionType(BinOp);
+ POLLY_DEBUG(dbgs() << "CurInst: " << Inst << " RT: " << CurRedType
+ << "\n");
+
+      // Iterate over all operands and propagate their input loads to this
+      // instruction.
+ FlowInSetTy &InstInFlowSet = State[&Inst];
+ for (Use &Op : Inst.operands()) {
+ auto *OpInst = dyn_cast<Instruction>(Op);
+ if (!OpInst)
+ continue;
+
+ POLLY_DEBUG(dbgs().indent(4) << "Op Inst: " << *OpInst << "\n");
+ const StateTy::iterator &OpInFlowSetIt = State.find(OpInst);
+ if (OpInFlowSetIt == State.end())
+ continue;
+
+ // Iterate over all the input loads of the operand and combine them
+        // with the input loads of the current instruction.
+ FlowInSetTy &OpInFlowSet = OpInFlowSetIt->second;
+ for (auto &OpInFlowPair : OpInFlowSet) {
+ unsigned OpFlowIn = OpInFlowPair.second.first;
+ unsigned InstFlowIn = InstInFlowSet[OpInFlowPair.first].first;
+
+ MemoryAccess::ReductionType OpRedType = OpInFlowPair.second.second;
+ MemoryAccess::ReductionType InstRedType =
+ InstInFlowSet[OpInFlowPair.first].second;
+
+ MemoryAccess::ReductionType NewRedType =
+ combineReductionType(OpRedType, CurRedType);
+ if (InstFlowIn)
+ NewRedType = combineReductionType(NewRedType, InstRedType);
+
+ POLLY_DEBUG(dbgs().indent(8) << "OpRedType: " << OpRedType << "\n");
+ POLLY_DEBUG(dbgs().indent(8) << "NewRedType: " << NewRedType << "\n");
+ InstInFlowSet[OpInFlowPair.first] =
+ std::make_pair(OpFlowIn + InstFlowIn, NewRedType);
+ }
+ }
- Loads.clear();
- collectCandidateReductionLoads(StoreMA, Loads);
- for (MemoryAccess *LoadMA : Loads)
- Candidates.push_back(std::make_pair(LoadMA, StoreMA));
+ // If this operation is used outside the stmt, invalidate all the loads
+ // which feed into it.
+ if (UsedOutsideStmt)
+ for (const auto &FlowInSetElem : InstInFlowSet)
+ InvalidLoads.insert(FlowInSetElem.first);
+ }
}
- // Then check each possible candidate pair.
- for (const auto &CandidatePair : Candidates) {
- MemoryAccess *LoadMA = CandidatePair.first;
- MemoryAccess *StoreMA = CandidatePair.second;
- bool Valid = checkCandidatePairAccesses(LoadMA, StoreMA, Stmt.getDomain(),
- Stmt.MemAccs);
- if (!Valid)
+ // All used loads are propagated through the whole basic block; now try to
+ // find valid reduction-like candidate pairs. These load-store pairs fulfill
+  // all reduction-like properties with regard to this load-store chain alone.
+ // We later have to check if the loaded value was invalidated by an
+ // instruction not in that chain.
+ using MemAccPair = std::pair<MemoryAccess *, MemoryAccess *>;
+ DenseMap<MemAccPair, MemoryAccess::ReductionType> ValidCandidates;
+
+  // Iterate over all write memory accesses and check the loads flowing into
+  // them for reduction candidate pairs.
+ for (MemoryAccess *WriteMA : Stmt.MemAccs) {
+ if (WriteMA->isRead())
+ continue;
+ StoreInst *St = dyn_cast<StoreInst>(WriteMA->getAccessInstruction());
+ if (!St)
continue;
+ assert(!St->isVolatile());
+
+ FlowInSetTy &MaInFlowSet = State[WriteMA->getAccessInstruction()];
+ for (auto &MaInFlowSetElem : MaInFlowSet) {
+ MemoryAccess *ReadMA = &Stmt.getArrayAccessFor(MaInFlowSetElem.first);
+ assert(ReadMA && "Couldn't find memory access for incoming load!");
- const LoadInst *Load =
- dyn_cast<const LoadInst>(CandidatePair.first->getAccessInstruction());
- MemoryAccess::ReductionType RT =
- getReductionType(dyn_cast<BinaryOperator>(Load->user_back()), Load);
+ POLLY_DEBUG(dbgs() << "'" << *ReadMA->getAccessInstruction()
+ << "'\n\tflows into\n'"
+ << *WriteMA->getAccessInstruction() << "'\n\t #"
+ << MaInFlowSetElem.second.first << " times & RT: "
+ << MaInFlowSetElem.second.second << "\n");
- // If no overlapping access was found we mark the load and store as
- // reduction like.
- LoadMA->markAsReductionLike(RT);
- StoreMA->markAsReductionLike(RT);
+ MemoryAccess::ReductionType RT = MaInFlowSetElem.second.second;
+ unsigned NumAllowableInFlow = 1;
+
+ // We allow the load to flow in exactly once for binary reductions
+ bool Valid = (MaInFlowSetElem.second.first == NumAllowableInFlow);
+
+ // Check if we saw a valid chain of binary operators.
+ Valid = Valid && RT != MemoryAccess::RT_BOTTOM;
+ Valid = Valid && RT != MemoryAccess::RT_NONE;
+
+ // Then check if the memory accesses allow a reduction.
+ Valid = Valid && checkCandidatePairAccesses(
+ ReadMA, WriteMA, Stmt.getDomain(), Stmt.MemAccs);
+
+      // Finally, mark the pair as a candidate or the load as an invalid one.
+ if (Valid)
+ ValidCandidates[std::make_pair(ReadMA, WriteMA)] = RT;
+ else
+ InvalidLoads.insert(ReadMA->getAccessInstruction());
+ }
+ }
+
+  // In the last step, mark the memory accesses of candidate pairs as
+  // reduction-like if the load wasn't marked invalid in the previous step.
+ for (auto &CandidatePair : ValidCandidates) {
+ MemoryAccess *LoadMA = CandidatePair.first.first;
+ if (InvalidLoads.count(LoadMA->getAccessInstruction()))
+ continue;
+ POLLY_DEBUG(
+ dbgs() << " Load :: "
+ << *((CandidatePair.first.first)->getAccessInstruction())
+ << "\n Store :: "
+ << *((CandidatePair.first.second)->getAccessInstruction())
+ << "\n are marked as reduction like\n");
+ MemoryAccess::ReductionType RT = CandidatePair.second;
+ CandidatePair.first.first->markAsReductionLike(RT);
+ CandidatePair.first.second->markAsReductionLike(RT);
}
}
@@ -2770,7 +2939,7 @@ isl::set ScopBuilder::getNonHoistableCtx(MemoryAccess *Access,
auto &DL = scop->getFunction().getDataLayout();
if (isSafeToLoadUnconditionally(LI->getPointerOperand(), LI->getType(),
- LI->getAlign(), DL)) {
+ LI->getAlign(), DL, nullptr)) {
SafeToLoad = isl::set::universe(AccessRelation.get_space().range());
} else if (BB != LI->getParent()) {
// Skip accesses in non-affine subregions as they might not be executed
@@ -2965,52 +3134,6 @@ void ScopBuilder::addInvariantLoads(ScopStmt &Stmt,
}
}
-void ScopBuilder::collectCandidateReductionLoads(
- MemoryAccess *StoreMA, SmallVectorImpl<MemoryAccess *> &Loads) {
- ScopStmt *Stmt = StoreMA->getStatement();
-
- auto *Store = dyn_cast<StoreInst>(StoreMA->getAccessInstruction());
- if (!Store)
- return;
-
- // Skip if there is not one binary operator between the load and the store
- auto *BinOp = dyn_cast<BinaryOperator>(Store->getValueOperand());
- if (!BinOp)
- return;
-
- // Skip if the binary operators has multiple uses
- if (BinOp->getNumUses() != 1)
- return;
-
- // Skip if the opcode of the binary operator is not commutative/associative
- if (!BinOp->isCommutative() || !BinOp->isAssociative())
- return;
-
- // Skip if the binary operator is outside the current SCoP
- if (BinOp->getParent() != Store->getParent())
- return;
-
- // Skip if it is a multiplicative reduction and we disabled them
- if (DisableMultiplicativeReductions &&
- (BinOp->getOpcode() == Instruction::Mul ||
- BinOp->getOpcode() == Instruction::FMul))
- return;
-
- // Check the binary operator operands for a candidate load
- auto *PossibleLoad0 = dyn_cast<LoadInst>(BinOp->getOperand(0));
- auto *PossibleLoad1 = dyn_cast<LoadInst>(BinOp->getOperand(1));
- if (!PossibleLoad0 && !PossibleLoad1)
- return;
-
- // A load is only a candidate if it cannot escape (thus has only this use)
- if (PossibleLoad0 && PossibleLoad0->getNumUses() == 1)
- if (PossibleLoad0->getParent() == Store->getParent())
- Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad0));
- if (PossibleLoad1 && PossibleLoad1->getNumUses() == 1)
- if (PossibleLoad1->getParent() == Store->getParent())
- Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad1));
-}
-
/// Find the canonical scop array info object for a set of invariant load
/// hoisted loads. The canonical array is the one that corresponds to the
/// first load in the list of accesses which is used as base pointer of a
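The rewritten checkForReductions accepts a load-store pair only when the load of the reduced location reaches the store along exactly one path through the operand tree, and the operators on that path agree. A minimal sketch of just the path-counting step, with hypothetical names and a plain std::map standing in for the MapVector-based state used above:

#include <cassert>
#include <map>
#include <string>

// For every instruction we record, per contributing load, how many
// operand-tree paths end in that load. A store is a reduction candidate only
// if the load of the stored location flows in exactly once.
using FlowInSet = std::map<std::string, unsigned>;

static FlowInSet combineOperands(const FlowInSet &LHS, const FlowInSet &RHS) {
  FlowInSet Result = LHS;
  for (const auto &Entry : RHS)
    Result[Entry.first] += Entry.second; // path counts add up across operands
  return Result;
}

int main() {
  FlowInSet LoadSum = {{"load %sum", 1}}; // each load starts with count 1
  FlowInSet LoadA = {{"load %A[i]", 1}};

  // *sum = *sum + A[i]: the load of %sum reaches the store once -> candidate.
  FlowInSet Add1 = combineOperands(LoadSum, LoadA);
  assert(Add1["load %sum"] == 1);

  // *sum = (*sum + A[i]) + *sum: the load of %sum flows in twice -> rejected.
  FlowInSet Add2 = combineOperands(Add1, LoadSum);
  assert(Add2["load %sum"] == 2);
  return 0;
}

In the patch the count is paired with the running ReductionType, so the same traversal also rejects chains that mix different operators.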
diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp
index eab7bd8..79db396 100644
--- a/polly/lib/Analysis/ScopDetection.cpp
+++ b/polly/lib/Analysis/ScopDetection.cpp
@@ -490,7 +490,8 @@ bool ScopDetection::onlyValidRequiredInvariantLoads(
for (auto NonAffineRegion : Context.NonAffineSubRegionSet) {
if (isSafeToLoadUnconditionally(Load->getPointerOperand(),
- Load->getType(), Load->getAlign(), DL))
+ Load->getType(), Load->getAlign(), DL,
+ nullptr))
continue;
if (NonAffineRegion->contains(Load) &&
diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp
index fa35fae..044d357 100644
--- a/polly/lib/Analysis/ScopInfo.cpp
+++ b/polly/lib/Analysis/ScopInfo.cpp
@@ -533,6 +533,9 @@ MemoryAccess::getReductionOperatorStr(MemoryAccess::ReductionType RT) {
case MemoryAccess::RT_NONE:
llvm_unreachable("Requested a reduction operator string for a memory "
"access which isn't a reduction");
+ case MemoryAccess::RT_BOTTOM:
+ llvm_unreachable("Requested a reduction operator string for a internal "
+ "reduction type!");
case MemoryAccess::RT_ADD:
return "+";
case MemoryAccess::RT_MUL:
@@ -915,10 +918,15 @@ isl::id MemoryAccess::getId() const { return Id; }
raw_ostream &polly::operator<<(raw_ostream &OS,
MemoryAccess::ReductionType RT) {
- if (RT == MemoryAccess::RT_NONE)
+ switch (RT) {
+ case MemoryAccess::RT_NONE:
+ case MemoryAccess::RT_BOTTOM:
OS << "NONE";
- else
+ break;
+ default:
OS << MemoryAccess::getReductionOperatorStr(RT);
+ break;
+ }
return OS;
}
diff --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll
new file mode 100644
index 0000000..3b4bd9ef
--- /dev/null
+++ b/polly/test/DependenceInfo/reduction_indirect_access.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction dependences:
+; CHECK: [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N }
+;
+; void f(double *restrict A, int *restrict INDICES, int N) {
+; for (int i = 0; i < N; i++)
+; A[INDICES[i]] += N;
+; }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(ptr noalias %A, ptr noalias %INDICES, i32 %N) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %N
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %conv = sitofp i32 %N to double
+  %arrayidx = getelementptr inbounds i32, ptr %INDICES, i32 %i.0
+  %tmp = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds double, ptr %A, i32 %tmp
+  %tmp1 = load double, ptr %arrayidx1, align 8
+  %add = fadd fast double %tmp1, %conv
+  store double %add, ptr %arrayidx1, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
diff --git a/polly/test/ScopInfo/reduction_double.ll b/polly/test/ScopInfo/reduction_double.ll
new file mode 100644
index 0000000..d126d3d
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_double.ll
@@ -0,0 +1,57 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Verify that two independent reductions in the same loop are detected.
+;
+; CHECK: Stmt_for_body
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum1[0]
+; CHECK-NEXT: Reduction Type: +
+; CHECK-NEXT: MemRef_sum1[0]
+;
+; CHECK: Stmt_for_body_b
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum2[0]
+; CHECK-NEXT: Reduction Type: +
+; CHECK-NEXT: MemRef_sum2[0]
+;
+; int red(int *A, int *B, int *sum1, int *sum2, int n) {
+;   for (int i = 0; i < n; ++i) {
+;     *sum1 += A[i];
+;     *sum2 += B[i];
+;   }
+; }
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local i32 @red(ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef %sum1, ptr nocapture noundef %sum2, i32 noundef %n) local_unnamed_addr #0 {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret i32 undef
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx
+ %1 = load i32, ptr %sum1
+ %add = add nsw i32 %1, %0
+ store i32 %add, ptr %sum1
+ %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx2
+ %3 = load i32, ptr %sum2
+ %add3 = add nsw i32 %3, %2
+ store i32 %add3, ptr %sum2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
new file mode 100644
index 0000000..92a071e
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -0,0 +1,43 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; void f(int N, int * restrict sums, int * restrict escape) {
+; int i, j;
+; for (i = 0; i < 1024; i++) {
+; sums[i] += 5;
+; escape[i] = sums[i];
+; }
+; }
+;
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: sums
+; CHECK: Reduction Type: NONE
+; CHECK: escape
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32 %N, i32* noalias %sums, i32* noalias %escape) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc8, %for.inc ]
+ %exitcond1 = icmp ne i32 %i.0, 1024
+ br i1 %exitcond1, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %arrayidx = getelementptr inbounds i32, i32* %sums, i32 0
+ %tmp = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %tmp, 5
+ store i32 %add, i32* %arrayidx, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32* %escape, i32 %i.0
+ store i32 %add, i32* %arrayidx6, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc8 = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/polly/test/ScopInfo/reduction_if.ll b/polly/test/ScopInfo/reduction_if.ll
new file mode 100644
index 0000000..4f7d368
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_if.ll
@@ -0,0 +1,52 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Verify that a reduction spread across multiple blocks in a single scop statement is detected.
+;
+; CHECK: Stmt_for_body
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum[0]
+; CHECK: Reduction Type: +
+; CHECK-NEXT: MemRef_sum[0]
+;
+; void f(int*__restrict A, int*__restrict B, int *sum) {
+; for (int i = 0; i < 4444; ++i) {
+; if (B[i])
+; *sum += A[i];
+; }
+; }
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local void @f(ptr noalias nocapture noundef readonly %A, ptr noalias nocapture noundef readonly %B, ptr nocapture noundef %sum) local_unnamed_addr #0 {
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc
+ ret void
+
+for.body: ; preds = %entry.split, %for.inc
+ %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2
+ %2 = load i32, ptr %sum
+ %add = add nsw i32 %2, %1
+ store i32 %add, ptr %sum
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4444
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
diff --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
new file mode 100644
index 0000000..7acac4b
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -0,0 +1,42 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+;
+; void f(double *restrict A, int *restrict INDICES, int N) {
+; for (int i = 0; i < N; i++)
+; A[INDICES[i]] += N;
+; }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %N
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %conv = sitofp i32 %N to double
+ %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+ %tmp = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+ %tmp1 = load double, double* %arrayidx1, align 8
+ %add = fadd fast double %tmp1, %conv
+ store double %add, double* %arrayidx1, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
new file mode 100644
index 0000000..3319539
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -0,0 +1,50 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+;
+; Validate that the accesses to INDICES[i] are not part of a reduction.
+;
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: +
+; CHECK: MemRef_A[o0]
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_INDICES[i0]
+;
+; void f(double *restrict A, int *restrict INDICES, int N) {
+; for (int i = 0; i < N; i++) {
+; A[INDICES[i]] += N;
+; INDICES[i] += N;
+; }
+; }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(double* noalias %A, i32* noalias %INDICES, i32 %N) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, %N
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %conv = sitofp i32 %N to double
+ %arrayidx = getelementptr inbounds i32, i32* %INDICES, i32 %i.0
+ %tmp = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds double, double* %A, i32 %tmp
+ %tmp1 = load double, double* %arrayidx1, align 8
+ %add = fadd fast double %tmp1, %conv
+ store double %add, double* %arrayidx1, align 8
+ %add3 = add nsw i32 %tmp, %N
+ store i32 %add3, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
new file mode 100644
index 0000000..62ae1fe
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -0,0 +1,61 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK: Reduction Type: NONE
+; CHECK: MemRef_A
+; CHECK: Reduction Type: +
+; CHECK: MemRef_sum
+; CHECK-NOT: MemRef_A
+;
+; void f(int *restrict sum, int *restrict A) {
+; for (int i = 0; i < 1024; i++)
+; *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+; (A[i + 2] * A[i - 1]);
+; }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %exitcond = icmp ne i32 %i.0, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %i.0, 3
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+ %tmp = load i32, i32* %arrayidx, align 4
+ %sub = add nsw i32 %i.0, -14
+ %mul = mul nsw i32 %tmp, %sub
+ %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+ %tmp1 = load i32, i32* %arrayidx1, align 4
+ %tmp2 = load i32, i32* %sum, align 4
+ %add2 = add nsw i32 %tmp1, %tmp2
+ %tmp3 = load i32, i32* %A, align 4
+ %add4 = add nsw i32 %add2, %tmp3
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+ %tmp4 = load i32, i32* %arrayidx5, align 4
+ %add6 = add nsw i32 %add4, %tmp4
+ %add7 = add nsw i32 %mul, %add6
+ %add8 = add nsw i32 %i.0, 2
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+ %tmp5 = load i32, i32* %arrayidx9, align 4
+ %sub10 = add nsw i32 %i.0, -1
+ %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+ %tmp6 = load i32, i32* %arrayidx11, align 4
+ %mul12 = mul nsw i32 %tmp5, %tmp6
+ %add13 = add nsw i32 %add7, %mul12
+ store i32 %add13, i32* %sum, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
new file mode 100644
index 0000000..7ca46fa
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -0,0 +1,58 @@
+; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; *sum is added twice in the statement, hence this is not a reduction.
+; CHECK: Reduction Type: NONE
+;
+; void f(int *restrict sum, int *restrict A) {
+; for (int i = 0; i < 1024; i++)
+; *sum = (A[i + 3] * (i - 14)) + ((A[i] + *sum + A[0]) + A[1023]) +
+; (A[i + 2] * A[i - 1]) + *sum;
+; }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum, i32* noalias %A) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %exitcond = icmp ne i32 %i.0, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %i.0, 3
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+ %tmp = load i32, i32* %arrayidx, align 4
+ %sub = add nsw i32 %i.0, -14
+ %mul = mul nsw i32 %tmp, %sub
+ %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.0
+ %tmp1 = load i32, i32* %arrayidx1, align 4
+ %tmp2 = load i32, i32* %sum, align 4
+ %add2 = add nsw i32 %tmp1, %tmp2
+ %tmp3 = load i32, i32* %A, align 4
+ %add4 = add nsw i32 %add2, %tmp3
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 1023
+ %tmp4 = load i32, i32* %arrayidx5, align 4
+ %add6 = add nsw i32 %add4, %tmp4
+ %add7 = add nsw i32 %mul, %add6
+ %add8 = add nsw i32 %i.0, 2
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i32 %add8
+ %tmp5 = load i32, i32* %arrayidx9, align 4
+ %sub10 = add nsw i32 %i.0, -1
+ %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %sub10
+ %tmp6 = load i32, i32* %arrayidx11, align 4
+ %mul12 = mul nsw i32 %tmp5, %tmp6
+ %add13 = add nsw i32 %add7, %mul12
+ %tmp7 = load i32, i32* %sum, align 4
+ %add14 = add nsw i32 %add13, %tmp7
+ store i32 %add14, i32* %sum, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
new file mode 100644
index 0000000..b77c72a
--- /dev/null
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -0,0 +1,37 @@
+; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+;
+; Should not be identified as a reduction because different operations are
+; involved on *sum (multiplication followed by addition).
+; CHECK: Reduction Type: NONE
+;
+; void f(int *restrict sum) {
+; for (int i = 0; i < 1024; i++) {
+; *sum = (*sum * 5) + 25;
+; }
+; }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @f(i32* noalias %sum) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %exitcond = icmp ne i32 %i.0, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %tmp = load i32, i32* %sum, align 4
+ %tmp1 = mul i32 %tmp, 5
+ %mul = add i32 %tmp1, 25
+ store i32 %mul, i32* %sum, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index cf57598..5aa6b01 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -4,10 +4,10 @@
load(
"//:vars.bzl",
- "LLVM_VERSION",
"LLVM_VERSION_MAJOR",
"LLVM_VERSION_MINOR",
"LLVM_VERSION_PATCH",
+ "PACKAGE_VERSION",
)
load("//:workspace_root.bzl", "workspace_root")
load("//llvm:binary_alias.bzl", "binary_alias")
@@ -553,12 +553,12 @@ genrule(
"echo '#define CLANG_VERSION_MAJOR_STRING \"{major}\"' >> $@\n" +
"echo '#define CLANG_VERSION_MINOR {minor}' >> $@\n" +
"echo '#define CLANG_VERSION_PATCHLEVEL {patch}' >> $@\n" +
- "echo '#define CLANG_VERSION_STRING \"{vers}git\"' >> $@\n"
+ "echo '#define CLANG_VERSION_STRING \"{vers}\"' >> $@\n"
).format(
major = LLVM_VERSION_MAJOR,
minor = LLVM_VERSION_MINOR,
patch = LLVM_VERSION_PATCH,
- vers = LLVM_VERSION,
+ vers = PACKAGE_VERSION,
),
)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 5294351..253b892 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1078,22 +1078,6 @@ libc_support_library(
)
libc_support_library(
- name = "__support_osutil_pid",
- srcs = ["src/__support/OSUtil/linux/pid.cpp"],
- hdrs = ["src/__support/OSUtil/pid.h"],
- target_compatible_with = select({
- "@platforms//os:linux": [],
- "//conditions:default": ["@platforms//:incompatible"],
- }),
- deps = [
- ":__support_macros_attributes",
- ":__support_macros_optimization",
- ":__support_osutil_syscall",
- ":types_pid_t",
- ],
-)
-
-libc_support_library(
name = "__support_stringutil",
srcs = glob(["src/__support/StringUtil/tables/**/*.h"]) + [
"src/__support/StringUtil/error_to_string.cpp",
@@ -3044,20 +3028,6 @@ libc_function(
)
libc_function(
- name = "getpid",
- srcs = ["src/unistd/linux/getpid.cpp"],
- hdrs = ["src/unistd/getpid.h"],
- deps = [
- ":__support_common",
- ":__support_macros_config",
- ":__support_osutil_pid",
- ":__support_osutil_syscall",
- ":errno",
- ":types_pid_t",
- ],
-)
-
-libc_function(
name = "getppid",
srcs = ["src/unistd/linux/getppid.cpp"],
hdrs = ["src/unistd/getppid.h"],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel
index 4549fa2..66d8ddb 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/unistd/BUILD.bazel
@@ -261,13 +261,6 @@ libc_test(
# ],
# )
-libc_test(
- name = "getpid_test",
- srcs = ["getpid_test.cpp"],
- libc_function_deps = [
- "//libc:getpid",
- ],
-)
libc_test(
name = "getppid_test",
diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl
index 2e3bff5..9de9666 100644
--- a/utils/bazel/llvm-project-overlay/llvm/config.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl
@@ -6,10 +6,10 @@
load(
"//:vars.bzl",
- "LLVM_VERSION",
"LLVM_VERSION_MAJOR",
"LLVM_VERSION_MINOR",
"LLVM_VERSION_PATCH",
+ "PACKAGE_VERSION",
)
def native_arch_defines(arch, triple):
@@ -108,7 +108,7 @@ llvm_config_defines = os_defines + builtin_thread_pointer + select({
"LLVM_VERSION_MAJOR={}".format(LLVM_VERSION_MAJOR),
"LLVM_VERSION_MINOR={}".format(LLVM_VERSION_MINOR),
"LLVM_VERSION_PATCH={}".format(LLVM_VERSION_PATCH),
- r'LLVM_VERSION_STRING=\"{}git\"'.format(LLVM_VERSION),
+ r'LLVM_VERSION_STRING=\"{}\"'.format(PACKAGE_VERSION),
# These shouldn't be needed by the C++11 standard, but are for some
# platforms (e.g. glibc < 2.18. See
# https://sourceware.org/bugzilla/show_bug.cgi?id=15366). These are also
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index f83c471..8493823 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3389,6 +3389,7 @@ td_library(
srcs = ["include/mlir/Dialect/NVGPU/IR/NVGPU.td"],
includes = ["include"],
deps = [
+ ":InferTypeOpInterfaceTdFiles",
":SideEffectInterfacesTdFiles",
],
)
@@ -3477,6 +3478,7 @@ cc_library(
":BytecodeOpInterface",
":GPUDialect",
":IR",
+ ":InferTypeOpInterface",
":LLVMDialect",
":NVGPUIncGen",
":SideEffectInterfaces",
@@ -10079,6 +10081,7 @@ td_library(
"include/mlir/Dialect/OpenACC/AccCommon.td",
"include/mlir/Dialect/OpenACC/OpenACCBase.td",
"include/mlir/Dialect/OpenACC/OpenACCOps.td",
+ "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td",
"include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td",
"include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td",
"include/mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td",
@@ -10093,6 +10096,23 @@ td_library(
)
gentbl_cc_library(
+ name = "OpenACCOpsInterfacesIncGen",
+ tbl_outs = [
+ (
+ ["-gen-op-interface-decls"],
+ "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.h.inc",
+ ),
+ (
+ ["-gen-op-interface-defs"],
+ "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.cpp.inc",
+ ),
+ ],
+ tblgen = ":mlir-tblgen",
+ td_file = "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td",
+ deps = [":OpenAccOpsTdFiles"],
+)
+
+gentbl_cc_library(
name = "OpenACCMPOpsInterfacesIncGen",
tbl_outs = [
(
@@ -10235,6 +10255,7 @@ cc_library(
":MemRefDialect",
":OpenACCMPOpsInterfacesIncGen",
":OpenACCOpsIncGen",
+ ":OpenACCOpsInterfacesIncGen",
":OpenACCTypeInterfacesIncGen",
":OpenACCTypesIncGen",
":SideEffectInterfaces",
@@ -11192,6 +11213,7 @@ cc_library(
":AffineDialect",
":Analysis",
":ArithDialect",
+ ":ArithUtils",
":AsmParser",
":BufferizationDialect",
":BufferizationTransforms",